/*
Copyright (c) 2000 Eric van der Vlist
4xt.org (http://4xt.org)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
``Software''), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
The name of the authors when specified in the source files shall be
kept unmodified.
THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL 4XT.ORG BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* Copyright (c) 1999-2000 by David Brownell. All Rights Reserved.
*
* This program is open source software; you may use, copy, modify, and
* redistribute it under the terms of the LICENSE with which it was
* originally distributed.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* LICENSE for more details.
*/
//
// Copyright (c) 1997, 1998 by Microstar Software Ltd.
// From Microstar's README (the entire original license):
//
// AElfred is free for both commercial and non-commercial use and
// redistribution, provided that Microstar's copyright and disclaimer are
// retained intact. You are free to modify AElfred for your own use and
// to redistribute AElfred with your modifications, provided that the
// modifications are clearly documented.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// merchantability or fitness for a particular purpose. Please use it AT
// YOUR OWN RISK.
//
//
// This program was originally developed as an illustration
// for an article published on XML.com (http://www.xml.com).
// Built on AElfred2, it shows how it is possible to describe
// the structure of an XML document.
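// The modifications to the original source are flagged by
// marker comments (the marker text itself is missing from this copy).
/**
 * (The beginning of this class comment is missing from this copy.)
 * Use the <code>StructSaxDriver</code> class as your entry point, as all
 * internal parser interfaces are subject to change.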
*
* @author Written by David Megginson <dmeggins@microstar.com>
* (version 1.2a with bugfixes)
* @author Updated by David Brownell <david-b@pacbell.net>
* @version $Date: 2000/05/29 12:10:24 $
* @see StructSaxDriver
*/
//
You may parse more than one document, but that must be done * sequentially. Only one thread at a time may use this parser. * * @param systemId The URI of the document; should never be null, * but may be so iff a reader or a stream is provided. * @param publicId The public identifier of the document, or null. * @param reader A character stream; must be null if stream isn't. * @param stream A byte input stream; must be null if reader isn't. * @param encoding The suggested encoding, or null if unknown. * @exception java.lang.Exception Basically SAXException or IOException */ // package private void doParse ( String systemId, String publicId, Reader reader, InputStream stream, String encoding ) throws Exception { if (handler == null) throw new IllegalStateException ("no callback handler"); basePublicId = publicId; baseURI = systemId; baseReader = reader; baseInputStream = stream; initializeVariables (); // predeclare the built-in entities here (replacement texts) // we don't need to intern(), since we're guaranteed literals // are always (globally) interned. setInternalEntity ("amp", "&"); setInternalEntity ("lt", "<"); setInternalEntity ("gt", ">"); setInternalEntity ("apos", "'"); setInternalEntity ("quot", """); handler.startDocument (); pushURL ("[document]", basePublicId, baseURI, baseReader, baseInputStream, encoding); try { parseDocument (); handler.endDocument (); } finally { if (baseReader != null) try { baseReader.close (); } catch (IOException e) { /* ignore */ } if (baseInputStream != null) try { baseInputStream.close (); } catch (IOException e) { /* ignore */ } if (is != null) try { is.close (); } catch (IOException e) { /* ignore */ } if (reader != null) try { reader.close (); } catch (IOException e) { /* ignore */ } cleanupVariables (); } } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. 
//

/**
 * Content type code: the element has not been declared.
 * @see #getElementContentType
 */
public static final int CONTENT_UNDECLARED = 0;

/**
 * Content type code: the element has a content model of ANY.
 * @see #getElementContentType
 */
public static final int CONTENT_ANY = 1;

/**
 * Content type code: the element has declared content of EMPTY.
 * @see #getElementContentType
 */
public static final int CONTENT_EMPTY = 2;

/**
 * Content type code: the element has mixed content.
 * @see #getElementContentType
 */
public static final int CONTENT_MIXED = 3;

/**
 * Content type code: the element has element content.
 * @see #getElementContentType
 */
public static final int CONTENT_ELEMENTS = 4;


//
// Constants for the entity type.
//

/**
 * Entity type code: the entity has not been declared.
 * @see #getEntityType
 */
public static final int ENTITY_UNDECLARED = 0;

/**
 * Entity type code: the entity is internal.
 * @see #getEntityType
 */
public static final int ENTITY_INTERNAL = 1;

/**
 * Entity type code: the entity is external, non-XML data.
 * @see #getEntityType
 */
public static final int ENTITY_NDATA = 2;

/**
 * Entity type code: the entity is external XML data.
 * @see #getEntityType
 */
public static final int ENTITY_TEXT = 3;


//
// Constants for attribute type.
//

/**
 * Attribute type code: the attribute has not been declared for this
 * element type.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_UNDECLARED = 0;

/**
 * Attribute type code: the attribute value is a string value.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_CDATA = 1;

/**
 * Attribute type code: the attribute value is a unique identifier.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ID = 2;

/**
 * Attribute type code: the attribute value is a reference to a
 * unique identifier.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREF = 3;

/**
 * Attribute type code: the attribute value is a list of ID references.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREFS = 4;

/**
 * Attribute type code: the attribute value is the name of an entity.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITY = 5;

/**
 * Attribute type code: the attribute value is a list of entity names.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITIES = 6;

/**
 * Attribute type code: the attribute value is a name token.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKEN = 7;

/**
 * Attribute type code: the attribute value is a list of name tokens.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKENS = 8;

/**
 * Attribute type code: the attribute value is a token from an
 * enumeration.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENUMERATED = 9;

/**
 * Attribute type code: the attribute is the name of a notation.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NOTATION = 10;


//
// When the class is loaded, populate the hash table of
// attribute types.
//

/**
 * Hash table of attribute types: maps a DTD attribute type keyword
 * (such as "CDATA") to the matching ATTRIBUTE_* code, boxed as an
 * Integer.  ATTRIBUTE_ENUMERATED has no keyword and so has no entry.
 */
private static Hashtable attributeTypeHash;
static {
    Hashtable types = new Hashtable (13);
    types.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
    types.put ("ID", new Integer (ATTRIBUTE_ID));
    types.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
    types.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
    types.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
    types.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
    types.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
    types.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
    types.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
    attributeTypeHash = types;
}


//
// Constants for supported encodings.  "external" is just a flag.
//
private static final int ENCODING_EXTERNAL = 0;
private static final int ENCODING_UTF_8 = 1;
private static final int ENCODING_ISO_8859_1 = 2;
private static final int ENCODING_UCS_2_12 = 3;
private static final int ENCODING_UCS_2_21 = 4;
private static final int ENCODING_UCS_4_1234 = 5;
private static final int ENCODING_UCS_4_4321 = 6;
private static final int ENCODING_UCS_4_2143 = 7;
private static final int ENCODING_UCS_4_3412 = 8;
private static final int ENCODING_ASCII = 9;


//
// Constants for attribute default value.
//

/**
 * Default value code: the attribute is not declared.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_UNDECLARED = 30;

/**
 * Default value code: the attribute has a literal default value
 * specified.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_SPECIFIED = 31;

/**
 * Default value code: the attribute was declared #IMPLIED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_IMPLIED = 32;

/**
 * Default value code: the attribute was declared #REQUIRED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_REQUIRED = 33;

/**
 * Default value code: the attribute was declared #FIXED.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_FIXED = 34;


//
// Constants for input.
//
private static final int INPUT_NONE = 0;
private static final int INPUT_INTERNAL = 1;
private static final int INPUT_EXTERNAL = 2;
private static final int INPUT_STREAM = 3;
private static final int INPUT_BUFFER = 4;
private static final int INPUT_READER = 5;


//
// Flags for reading literals.
// // expand general entity refs (attribute values in dtd and content) private final static int LIT_ENTITY_REF = 2; // normalize this value (whitespace etc) (attributes, public ids) private final static int LIT_NORMALIZE = 4; // literal is an attribute value private final static int LIT_ATTRIBUTE = 8; // don't expand parameter entities private final static int LIT_DISABLE_PE = 16; // don't expand [or parse] character refs private final static int LIT_DISABLE_CREF = 32; // don't parse general entity refs private final static int LIT_DISABLE_EREF = 64; // don't expand general entities, but make sure we _could_ private final static int LIT_ENTITY_CHECK = 128; // // Flags affecting PE handling in DTDs (if expandPE is true). // PEs expand with space padding, except inside literals. // private final static int CONTEXT_NORMAL = 0; private final static int CONTEXT_LITERAL = 1; ////////////////////////////////////////////////////////////////////// // Error reporting. ////////////////////////////////////////////////////////////////////// /** * Report an error. * @param message The error message. * @param textFound The text that caused the error (or null). * @see StructSaxDriver#error * @see #line */ private void error (String message, String textFound, String textExpected) throws SAXException { if (textFound != null) { message = message + " (found \"" + textFound + "\")"; } if (textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } String uri = null; if (externalEntity != null) { uri = externalEntity.getURL ().toString (); } handler.error (message, uri, line, column); // "can't happen" throw new SAXException (message); } /** * Report a serious error. * @param message The error message. * @param textFound The text that caused the error (or null). 
*/ private void error (String message, char textFound, String textExpected) throws SAXException { error (message, new Character (textFound).toString (), textExpected); } /** Report typical case fatal errors. */ private void error (String message) throws SAXException { error (message, null, null); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. *
* [1] document ::= prolog element Misc* **
This is the top-level parsing function for a single XML
* document. As a minimum, a well-formed document must have
* a document element, and a valid document must have a prolog
* (one with doctype) as well.
*/
private void parseDocument ()
throws Exception
{
char c;
//
/**
 * Parse a comment.
 * <pre>
 * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
 * </pre>
 * <p>(The <code>&lt;!--</code> has already been read.)
 */
private void parseComment ()
throws Exception
{
char c;
boolean saved = expandPE;
expandPE = false;
parseUntil ("--");
require ('>');
expandPE = saved;
//
* [16] PI ::= '<?' PITarget * (S (Char* - (Char* '?>' Char*)))? * '?>' * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) **
(The <?
has already been read.)
*/
private void parsePI ()
throws SAXException, IOException
{
String name;
boolean saved = expandPE;
expandPE = false;
name = readNmtoken (true);
if ("xml".equalsIgnoreCase (name))
error ("Illegal processing instruction target", name, null);
if (!tryRead ("?>")) {
requireWhitespace ();
parseUntil ("?>");
}
expandPE = saved;
handler.processingInstruction (name, dataBufferToString ());
}
/**
 * Parse a CDATA section.
 * <pre>
 * [18] CDSect ::= CDStart CData CDEnd
 * [19] CDStart ::= '&lt;![CDATA['
 * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
 * [21] CDEnd ::= ']]&gt;'
 * </pre>
 * <p>(The '&lt;![CDATA[' has already been read.)
 *
 * <p>Reads up to the closing delimiter, then flushes the data buffer.
 */
private void parseCDSect ()
throws Exception
{
    parseUntil ("]]>");
    dataBufferFlush ();
}


/**
 * Parse the prolog of an XML document.
 * <pre>
 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
 * </pre>
 * <p>There are a couple of tricks here.  First, it is necessary to
 * declare the XML default attributes after the DTD (if present)
 * has been read. [??]  Second, it is not possible to expand general
 * references in attribute value literals until after the entire
 * DTD (if present) has been parsed.
 * <p>We do not look for the XML declaration here, because it was
 * handled by pushURL ().
 * @see pushURL
 */
private void parseProlog ()
throws Exception
{
//
* [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [24] VersionInfo ::= S 'version' Eq * ("'" VersionNum "'" | '"' VersionNum '"' ) * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* * [32] SDDecl ::= S 'standalone' Eq * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) * [80] EncodingDecl ::= S 'encoding' Eq * ( "'" EncName "'" | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* **
(The <?xml
and whitespace have already been read.)
* @return the encoding in the declaration, uppercased; or null
* @see #parseTextDecl
* @see #setupDecoding
*/
private String parseXMLDecl (boolean ignoreEncoding)
throws SAXException, IOException
{
boolean white;
int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
// Read the version.
require ("version");
parseEq ();
version = readLiteral (flags);
if (!version.equals ("1.0")) {
error ("unsupported XML version", version, "1.0");
}
// Try reading an encoding declaration.
white = tryWhitespace ();
if (tryRead ("encoding")) {
if (!white)
error ("whitespace required before 'encoding='");
parseEq ();
encodingName = readLiteral (flags);
if (!ignoreEncoding)
setupDecoding (encodingName);
}
// Try reading a standalone declaration
if (encodingName != null)
white = tryWhitespace ();
if (tryRead ("standalone")) {
if (!white)
error ("whitespace required before 'standalone='");
parseEq ();
standalone = readLiteral (flags);
if (! ("yes".equals (standalone) || "no".equals (standalone)))
error ("standalone flag must be 'yes' or 'no'");
}
skipWhitespace ();
require ("?>");
return encodingName;
}
/**
 * Parse a text declaration.
 * <pre>
 * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
 * [80] EncodingDecl ::= S 'encoding' Eq
 *		( '"' EncName '"' | "'" EncName "'" )
 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 * </pre>
 * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
 *
 * @param ignoreEncoding when true, the declared encoding is returned
 *	but setupDecoding is not called for it.
 * @return the encoding name read from the declaration.  No case
 *	conversion is applied in this method.
 * @see #parseXMLDecl
 * @see #setupDecoding
 */
private String parseTextDecl (boolean ignoreEncoding)
throws SAXException, IOException
{
    final int literalFlags =
        LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;

    // Read an optional version.
    if (tryRead ("version")) {
        parseEq ();
        final String declaredVersion = readLiteral (literalFlags);
        if (!declaredVersion.equals ("1.0")) {
            error ("unsupported XML version", declaredVersion, "1.0");
        }
        requireWhitespace ();
    }

    // Read the encoding; unlike the version, it is mandatory here.
    require ("encoding");
    parseEq ();
    final String declaredEncoding = readLiteral (literalFlags);
    if (!ignoreEncoding) {
        setupDecoding (declaredEncoding);
    }

    skipWhitespace ();
    require ("?>");

    return declaredEncoding;
}
/**
* Sets up internal state so that we can decode an entity using the
* specified encoding. This is used when we start to read an entity
* and we have been given knowledge of its encoding before we start to
* read any data (e.g. from a SAX input source or from a MIME type).
*
*
It is also used after autodetection, at which point only very * limited adjustments to the encoding may be used (switching between * related builtin decoders). * * @param encodingName The name of the encoding specified by the user. * @exception IOException if the encoding isn't supported either * internally to this parser, or by the hosting JVM. * @see #parseXMLDecl * @see #parseTextDecl */ private void setupDecoding (String encodingName) throws SAXException, IOException { encodingName = encodingName.toUpperCase (); // ENCODING_EXTERNAL indicates an encoding that wasn't // autodetected ... we can use builtin decoders, or // ones from the JVM (InputStreamReader). // Otherwise we can only tweak what was autodetected, and // only for single byte (ASCII derived) builtin encodings. // ASCII-derived encodings if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { if (encodingName.equals ("ISO-8859-1") || encodingName.equals ("8859_1") || encodingName.equals ("ISO8859_1") ) { encoding = ENCODING_ISO_8859_1; return; } else if (encodingName.equals ("US-ASCII") || encodingName.equals ("ASCII")) { encoding = ENCODING_ASCII; return; } else if (encodingName.equals ("UTF-8") || encodingName.equals ("UTF8")) { encoding = ENCODING_UTF_8; return; } else if (encoding != ENCODING_EXTERNAL) { // fatal error error ("unsupported ASCII-derived encoding", encodingName, "UTF-8, US-ASCII, or ISO-8859-1"); } // else fallthrough ... 
// it's ASCII-ish and something other than a builtin } // Unicode and such if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { if (!(encodingName.equals ("ISO-10646-UCS-2") || encodingName.equals ("UTF-16") || encodingName.equals ("UTF-16BE") || encodingName.equals ("UTF-16LE"))) error ("unsupported Unicode encoding", encodingName, "UTF-16"); return; } // four byte encodings if (encoding == ENCODING_UCS_4_1234 || encoding == ENCODING_UCS_4_4321 || encoding == ENCODING_UCS_4_2143 || encoding == ENCODING_UCS_4_3412) { if (!encodingName.equals ("ISO-10646-UCS-4")) error ("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); return; } // assert encoding == ENCODING_EXTERNAL // if (encoding != ENCODING_EXTERNAL) // throw new RuntimeException ("encoding = " + encoding); if (encodingName.equals ("UTF-16BE")) { encoding = ENCODING_UCS_2_12; return; } if (encodingName.equals ("UTF-16LE")) { encoding = ENCODING_UCS_2_21; return; } // We couldn't use the builtin decoders at all. But we can try to // create a reader, since we haven't messed up buffering. Tweak // the encoding name if necessary. if (encodingName.equals ("UTF-16") || encodingName.equals ("ISO-10646-UCS-2")) encodingName = "Unicode"; // Ignoring all the EBCDIC aliases here reader = new InputStreamReader (is, encodingName); sourceType = INPUT_READER; is = null; } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. *
* [27] Misc ::= Comment | PI | S **/ private void parseMisc () throws Exception { while (true) { skipWhitespace (); if (tryRead ("")) { parsePI (); } else if (tryRead ("