diff --git a/StructXmlParser.java b/StructXmlParser.java
new file mode 100644
index 0000000..d70d6a6
--- /dev/null
+++ b/StructXmlParser.java
@@ -0,0 +1,4997 @@
+/*
+Copyright (c) 2000 Eric van der Vlist
+ 4xt.org (http://4xt.org)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+The name of the authors when specified in the source files shall be
+kept unmodified.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL 4XT.ORG BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Copyright (c) 1999-2000 by David Brownell. All Rights Reserved.
+ *
+ * This program is open source software; you may use, copy, modify, and
+ * redistribute it under the terms of the LICENSE with which it was
+ * originally distributed.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * LICENSE for more details.
+ */
+
+//
+// Copyright (c) 1997, 1998 by Microstar Software Ltd.
+// From Microstar's README (the entire original license):
+//
+// AElfred is free for both commercial and non-commercial use and
+// redistribution, provided that Microstar's copyright and disclaimer are
+// retained intact. You are free to modify AElfred for your own use and
+// to redistribute AElfred with your modifications, provided that the
+// modifications are clearly documented.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// merchantability or fitness for a particular purpose. Please use it AT
+// YOUR OWN RISK.
+//
+
+//
+// This program was originally developed as an illustration
+// for an article published on XML.com (http://www.xml.com).
+// Built on AElfred2, it shows how it is possible to describe
+// the structure of an XML document.
+// The modification over the original source are flagged by
+// StructSaxDriver
class as your entry point, as all
+ * internal parser interfaces are subject to change.
+ *
+ * @author Written by David Megginson <dmeggins@microstar.com>
+ * (version 1.2a with bugfixes)
+ * @author Updated by David Brownell <david-b@pacbell.net>
+ * @version $Date: 2000/05/29 12:10:24 $
+ * @see StructSaxDriver
+ */
+//
You may parse more than one document, but that must be done + * sequentially. Only one thread at a time may use this parser. + * + * @param systemId The URI of the document; should never be null, + * but may be so iff a reader or a stream is provided. + * @param publicId The public identifier of the document, or null. + * @param reader A character stream; must be null if stream isn't. + * @param stream A byte input stream; must be null if reader isn't. + * @param encoding The suggested encoding, or null if unknown. + * @exception java.lang.Exception Basically SAXException or IOException + */ + // package private + void doParse ( + String systemId, + String publicId, + Reader reader, + InputStream stream, + String encoding + ) throws Exception + { + if (handler == null) + throw new IllegalStateException ("no callback handler"); + + basePublicId = publicId; + baseURI = systemId; + baseReader = reader; + baseInputStream = stream; + + initializeVariables (); + + // predeclare the built-in entities here (replacement texts) + // we don't need to intern(), since we're guaranteed literals + // are always (globally) interned. 
+ setInternalEntity ("amp", "&"); + setInternalEntity ("lt", "<"); + setInternalEntity ("gt", ">"); + setInternalEntity ("apos", "'"); + setInternalEntity ("quot", """); + + handler.startDocument (); + + pushURL ("[document]", basePublicId, baseURI, + baseReader, baseInputStream, encoding); + + try { + parseDocument (); + handler.endDocument (); + } finally { + if (baseReader != null) + try { baseReader.close (); + } catch (IOException e) { /* ignore */ } + if (baseInputStream != null) + try { baseInputStream.close (); + } catch (IOException e) { /* ignore */ } + if (is != null) + try { is.close (); + } catch (IOException e) { /* ignore */ } + if (reader != null) + try { + reader.close (); + } catch (IOException e) { /* ignore */ + } + cleanupVariables (); + } + } + + + //////////////////////////////////////////////////////////////////////// + // Constants. + //////////////////////////////////////////////////////////////////////// + + // + // Constants for element content type. + // + + /** + * Constant: an element has not been declared. + * @see #getElementContentType + */ + public final static int CONTENT_UNDECLARED = 0; + + /** + * Constant: the element has a content model of ANY. + * @see #getElementContentType + */ + public final static int CONTENT_ANY = 1; + + /** + * Constant: the element has declared content of EMPTY. + * @see #getElementContentType + */ + public final static int CONTENT_EMPTY = 2; + + /** + * Constant: the element has mixed content. + * @see #getElementContentType + */ + public final static int CONTENT_MIXED = 3; + + /** + * Constant: the element has element content. + * @see #getElementContentType + */ + public final static int CONTENT_ELEMENTS = 4; + + + // + // Constants for the entity type. + // + + /** + * Constant: the entity has not been declared. + * @see #getEntityType + */ + public final static int ENTITY_UNDECLARED = 0; + + /** + * Constant: the entity is internal. 
+ * @see #getEntityType + */ + public final static int ENTITY_INTERNAL = 1; + + /** + * Constant: the entity is external, non-XML data. + * @see #getEntityType + */ + public final static int ENTITY_NDATA = 2; + + /** + * Constant: the entity is external XML data. + * @see #getEntityType + */ + public final static int ENTITY_TEXT = 3; + + + // + // Constants for attribute type. + // + + /** + * Constant: the attribute has not been declared for this element type. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_UNDECLARED = 0; + + /** + * Constant: the attribute value is a string value. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_CDATA = 1; + + /** + * Constant: the attribute value is a unique identifier. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ID = 2; + + /** + * Constant: the attribute value is a reference to a unique identifier. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_IDREF = 3; + + /** + * Constant: the attribute value is a list of ID references. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_IDREFS = 4; + + /** + * Constant: the attribute value is the name of an entity. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENTITY = 5; + + /** + * Constant: the attribute value is a list of entity names. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENTITIES = 6; + + /** + * Constant: the attribute value is a name token. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_NMTOKEN = 7; + + /** + * Constant: the attribute value is a list of name tokens. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_NMTOKENS = 8; + + /** + * Constant: the attribute value is a token from an enumeration. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENUMERATED = 9; + + /** + * Constant: the attribute is the name of a notation. 
+ * @see #getAttributeType + */ + public final static int ATTRIBUTE_NOTATION = 10; + + + // + // When the class is loaded, populate the hash table of + // attribute types. + // + + /** + * Hash table of attribute types. + */ + private static Hashtable attributeTypeHash; + static { + attributeTypeHash = new Hashtable (13); + attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA)); + attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID)); + attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF)); + attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS)); + attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY)); + attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES)); + attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN)); + attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS)); + attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION)); + } + + + // + // Constants for supported encodings. "external" is just a flag. + // + private final static int ENCODING_EXTERNAL = 0; + private final static int ENCODING_UTF_8 = 1; + private final static int ENCODING_ISO_8859_1 = 2; + private final static int ENCODING_UCS_2_12 = 3; + private final static int ENCODING_UCS_2_21 = 4; + private final static int ENCODING_UCS_4_1234 = 5; + private final static int ENCODING_UCS_4_4321 = 6; + private final static int ENCODING_UCS_4_2143 = 7; + private final static int ENCODING_UCS_4_3412 = 8; + private final static int ENCODING_ASCII = 9; + + + // + // Constants for attribute default value. + // + + /** + * Constant: the attribute is not declared. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; + + /** + * Constant: the attribute has a literal default value specified. 
+ * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; + + /** + * Constant: the attribute was declared #IMPLIED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; + + /** + * Constant: the attribute was declared #REQUIRED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; + + /** + * Constant: the attribute was declared #FIXED. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_FIXED = 34; + + + // + // Constants for input. + // + private final static int INPUT_NONE = 0; + private final static int INPUT_INTERNAL = 1; + private final static int INPUT_EXTERNAL = 2; + private final static int INPUT_STREAM = 3; + private final static int INPUT_BUFFER = 4; + private final static int INPUT_READER = 5; + + + // + // Flags for reading literals. + // + // expand general entity refs (attribute values in dtd and content) + private final static int LIT_ENTITY_REF = 2; + // normalize this value (whitespace etc) (attributes, public ids) + private final static int LIT_NORMALIZE = 4; + // literal is an attribute value + private final static int LIT_ATTRIBUTE = 8; + // don't expand parameter entities + private final static int LIT_DISABLE_PE = 16; + // don't expand [or parse] character refs + private final static int LIT_DISABLE_CREF = 32; + // don't parse general entity refs + private final static int LIT_DISABLE_EREF = 64; + // don't expand general entities, but make sure we _could_ + private final static int LIT_ENTITY_CHECK = 128; + + + // + // Flags affecting PE handling in DTDs (if expandPE is true). + // PEs expand with space padding, except inside literals. 
+ // + private final static int CONTEXT_NORMAL = 0; + private final static int CONTEXT_LITERAL = 1; + + + ////////////////////////////////////////////////////////////////////// + // Error reporting. + ////////////////////////////////////////////////////////////////////// + + + /** + * Report an error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + * @see StructSaxDriver#error + * @see #line + */ + private void error (String message, String textFound, String textExpected) + throws SAXException + { + if (textFound != null) { + message = message + " (found \"" + textFound + "\")"; + } + if (textExpected != null) { + message = message + " (expected \"" + textExpected + "\")"; + } + String uri = null; + + if (externalEntity != null) { + uri = externalEntity.getURL ().toString (); + } + handler.error (message, uri, line, column); + + // "can't happen" + throw new SAXException (message); + } + + + /** + * Report a serious error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + */ + private void error (String message, char textFound, String textExpected) + throws SAXException + { + error (message, new Character (textFound).toString (), textExpected); + } + + /** Report typical case fatal errors. */ + private void error (String message) + throws SAXException + { + error (message, null, null); + } + + + ////////////////////////////////////////////////////////////////////// + // Major syntactic productions. + ////////////////////////////////////////////////////////////////////// + + + /** + * Parse an XML document. + *
+ * <pre>
+ * [1] document ::= prolog element Misc*
+ * </pre>
+ * <p>This is the top-level parsing function for a single XML
+ * document. As a minimum, a well-formed document must have
+ * a document element, and a valid document must have a prolog
+ * (one with doctype) as well.
+ */
+ private void parseDocument ()
+ throws Exception
+ {
+ char c;
+
+//
+ /**
+ * Parse a comment.
+ * <pre>
+ * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
+ * </pre>
+ * <p>(The <code>&lt;!--</code> has already been read.)
+ */
+ private void parseComment ()
+ throws Exception
+ {
+ char c;
+ boolean saved = expandPE;
+
+ expandPE = false;
+ parseUntil ("--");
+ require ('>');
+ expandPE = saved;
+//
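+ /**
+ * Parse a processing instruction and report it to the handler.
+ * <pre>
+ * [16] PI ::= '&lt;?' PITarget
+ *             (S (Char* - (Char* '?&gt;' Char*)))?
+ *             '?&gt;'
+ * [17] PITarget ::= Name - ( ('X'|'x') ('M'|'m') ('L'|'l') )
+ * </pre>
+ * <p>(The <code>&lt;?</code> has already been read.)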
+ */
+ private void parsePI ()
+ throws SAXException, IOException
+ {
+ String name;
+ boolean saved = expandPE;
+
+ expandPE = false;
+ name = readNmtoken (true);
+ if ("xml".equalsIgnoreCase (name))
+ error ("Illegal processing instruction target", name, null);
+ if (!tryRead ("?>")) {
+ requireWhitespace ();
+ parseUntil ("?>");
+ }
+ expandPE = saved;
+ handler.processingInstruction (name, dataBufferToString ());
+ }
+
+
+ /**
+ * Parse a CDATA section.
+ * <pre>
+ * [18] CDSect ::= CDStart CData CDEnd
+ * [19] CDStart ::= '&lt;![CDATA['
+ * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
+ * [21] CDEnd ::= ']]&gt;'
+ * </pre>
+ * <p>
(The '<![CDATA[' has already been read.)
+     */
+    private void parseCDSect ()
+    throws Exception
+    {
+        parseUntil ("]]>");     // presumably consumes input up to the CDEnd delimiter -- confirm in parseUntil
+        dataBufferFlush ();     // then flushes the data buffer; the two calls must stay in this order
+    }
+
+
+    /**
+     * Parse the prolog of an XML document.
+     *
+ * <pre>
+ * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
+ * </pre>
+ * <p>There are a couple of tricks here. First, it is necessary to
+ * declare the XML default attributes after the DTD (if present)
+ * has been read. [??] Second, it is not possible to expand general
+ * references in attribute value literals until after the entire
+ * DTD (if present) has been parsed.
+ * <p>We do not look for the XML declaration here, because it was
+ * handled by pushURL ().
+ * @see #pushURL
+ */
+ private void parseProlog ()
+ throws Exception
+ {
+
+//
+ /**
+ * Parse the XML declaration.
+ * <pre>
+ * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
+ * [24] VersionInfo ::= S 'version' Eq
+ *                      ("'" VersionNum "'" | '"' VersionNum '"' )
+ * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
+ * [32] SDDecl ::= S 'standalone' Eq
+ *                 ( "'" ('yes' | 'no') "'" | '"' ('yes' | 'no') '"' )
+ * [80] EncodingDecl ::= S 'encoding' Eq
+ *                       ( '"' EncName '"' | "'" EncName "'" )
+ * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+ * </pre>
+ * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
+ * @return the encoding in the declaration, uppercased; or null
+ * @see #parseTextDecl
+ * @see #setupDecoding
+ */
+ private String parseXMLDecl (boolean ignoreEncoding)
+ throws SAXException, IOException
+ {
+ boolean white;
+ int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
+
+ // Read the version.
+ require ("version");
+ parseEq ();
+ version = readLiteral (flags);
+ if (!version.equals ("1.0")) {
+ error ("unsupported XML version", version, "1.0");
+ }
+
+ // Try reading an encoding declaration.
+ white = tryWhitespace ();
+ if (tryRead ("encoding")) {
+ if (!white)
+ error ("whitespace required before 'encoding='");
+ parseEq ();
+ encodingName = readLiteral (flags);
+ if (!ignoreEncoding)
+ setupDecoding (encodingName);
+ }
+
+ // Try reading a standalone declaration
+ if (encodingName != null)
+ white = tryWhitespace ();
+ if (tryRead ("standalone")) {
+ if (!white)
+ error ("whitespace required before 'standalone='");
+ parseEq ();
+ standalone = readLiteral (flags);
+ if (! ("yes".equals (standalone) || "no".equals (standalone)))
+ error ("standalone flag must be 'yes' or 'no'");
+ }
+
+ skipWhitespace ();
+ require ("?>");
+
+ return encodingName;
+ }
+
+
+ /**
+ * Parse a text declaration.
+ * <pre>
+ * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
+ * [80] EncodingDecl ::= S 'encoding' Eq
+ *                       ( '"' EncName '"' | "'" EncName "'" )
+ * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+ * </pre>
+ * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
+ * @return the encoding in the declaration, uppercased; or null
+ * @see #parseXMLDecl
+ * @see #setupDecoding
+ */
+ private String parseTextDecl (boolean ignoreEncoding)
+ throws SAXException, IOException
+ {
+ String encodingName = null;
+ int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
+
+ // Read an optional version.
+ if (tryRead ("version")) {
+ String version;
+ parseEq ();
+ version = readLiteral (flags);
+ if (!version.equals ("1.0")) {
+ error ("unsupported XML version", version, "1.0");
+ }
+ requireWhitespace ();
+ }
+
+
+ // Read the encoding.
+ require ("encoding");
+ parseEq ();
+ encodingName = readLiteral (flags);
+ if (!ignoreEncoding)
+ setupDecoding (encodingName);
+
+ skipWhitespace ();
+ require ("?>");
+
+ return encodingName;
+ }
+
+
+ /**
+ * Sets up internal state so that we can decode an entity using the
+ * specified encoding. This is used when we start to read an entity
+ * and we have been given knowledge of its encoding before we start to
+ * read any data (e.g. from a SAX input source or from a MIME type).
+ *
+ *
It is also used after autodetection, at which point only very + * limited adjustments to the encoding may be used (switching between + * related builtin decoders). + * + * @param encodingName The name of the encoding specified by the user. + * @exception IOException if the encoding isn't supported either + * internally to this parser, or by the hosting JVM. + * @see #parseXMLDecl + * @see #parseTextDecl + */ + private void setupDecoding (String encodingName) + throws SAXException, IOException + { + encodingName = encodingName.toUpperCase (); + + // ENCODING_EXTERNAL indicates an encoding that wasn't + // autodetected ... we can use builtin decoders, or + // ones from the JVM (InputStreamReader). + + // Otherwise we can only tweak what was autodetected, and + // only for single byte (ASCII derived) builtin encodings. + + // ASCII-derived encodings + if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { + if (encodingName.equals ("ISO-8859-1") + || encodingName.equals ("8859_1") + || encodingName.equals ("ISO8859_1") + ) { + encoding = ENCODING_ISO_8859_1; + return; + } else if (encodingName.equals ("US-ASCII") + || encodingName.equals ("ASCII")) { + encoding = ENCODING_ASCII; + return; + } else if (encodingName.equals ("UTF-8") + || encodingName.equals ("UTF8")) { + encoding = ENCODING_UTF_8; + return; + } else if (encoding != ENCODING_EXTERNAL) { + // fatal error + error ("unsupported ASCII-derived encoding", + encodingName, + "UTF-8, US-ASCII, or ISO-8859-1"); + } + // else fallthrough ... 
+ // it's ASCII-ish and something other than a builtin + } + + // Unicode and such + if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { + if (!(encodingName.equals ("ISO-10646-UCS-2") + || encodingName.equals ("UTF-16") + || encodingName.equals ("UTF-16BE") + || encodingName.equals ("UTF-16LE"))) + error ("unsupported Unicode encoding", + encodingName, + "UTF-16"); + return; + } + + // four byte encodings + if (encoding == ENCODING_UCS_4_1234 + || encoding == ENCODING_UCS_4_4321 + || encoding == ENCODING_UCS_4_2143 + || encoding == ENCODING_UCS_4_3412) { + if (!encodingName.equals ("ISO-10646-UCS-4")) + error ("unsupported 32-bit encoding", + encodingName, + "ISO-10646-UCS-4"); + return; + } + + // assert encoding == ENCODING_EXTERNAL + // if (encoding != ENCODING_EXTERNAL) + // throw new RuntimeException ("encoding = " + encoding); + + if (encodingName.equals ("UTF-16BE")) { + encoding = ENCODING_UCS_2_12; + return; + } + if (encodingName.equals ("UTF-16LE")) { + encoding = ENCODING_UCS_2_21; + return; + } + + // We couldn't use the builtin decoders at all. But we can try to + // create a reader, since we haven't messed up buffering. Tweak + // the encoding name if necessary. + + if (encodingName.equals ("UTF-16") + || encodingName.equals ("ISO-10646-UCS-2")) + encodingName = "Unicode"; + // Ignoring all the EBCDIC aliases here + + reader = new InputStreamReader (is, encodingName); + sourceType = INPUT_READER; + is = null; + } + + + /** + * Parse miscellaneous markup outside the document element and DOCTYPE + * declaration. + *
+ * [27] Misc ::= Comment | PI | S + *+ */ + private void parseMisc () + throws Exception + { + while (true) { + skipWhitespace (); + if (tryRead ("")) { + parsePI (); + } else if (tryRead ("