diff --git a/StructXmlParser.java b/StructXmlParser.java new file mode 100644 index 0000000..d70d6a6 --- /dev/null +++ b/StructXmlParser.java @@ -0,0 +1,4997 @@ +/* +Copyright (c) 2000 Eric van der Vlist + 4xt.org (http://4xt.org) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +``Software''), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +The name of the authors when specified in the source files shall be +kept unmodified. + +THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL 4XT.ORG BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +/* + * Copyright (c) 1999-2000 by David Brownell. All Rights Reserved. + * + * This program is open source software; you may use, copy, modify, and + * redistribute it under the terms of the LICENSE with which it was + * originally distributed. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * LICENSE for more details. + */ + +// +// Copyright (c) 1997, 1998 by Microstar Software Ltd. +// From Microstar's README (the entire original license): +// +// AElfred is free for both commercial and non-commercial use and +// redistribution, provided that Microstar's copyright and disclaimer are +// retained intact. You are free to modify AElfred for your own use and +// to redistribute AElfred with your modifications, provided that the +// modifications are clearly documented. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// merchantability or fitness for a particular purpose. Please use it AT +// YOUR OWN RISK. +// + +// +// This program has been originally developped as an illustration +// for an article published on XML.com (http://www.xml.com). +// Built on AElfred2, it shows how it is possible to describe +// the structure of a XML document. +// The modification over the original source are flagged by +// tags. + +// +// package org.brownell.xml.aelfred2; +import org.brownell.xml.aelfred2.*; +// + +import java.io.BufferedInputStream; +import java.io.CharConversionException; +import java.io.EOFException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.Reader; +import java.net.URL; +import java.net.URLConnection; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Stack; + +import org.xml.sax.SAXException; +// +import org.xml.sax.SAXNotRecognizedException; +// + +// $Id: XmlParser.java,v 1.20 2000/05/29 12:10:24 mojo Exp $ + +/** + * Parse XML documents and return parse events through call-backs. + * Use the StructSaxDriver class as your entry point, as all + * internal parser interfaces are subject to change. + * + * @author Written by David Megginson <dmeggins@microstar.com> + * (version 1.2a with bugfixes) + * @author Updated by David Brownell <david-b@pacbell.net> + * @version $Date: 2000/05/29 12:10:24 $ + * @see StructSaxDriver + */ +// +// final class XmlParser +final class StructXmlParser +// +{ + // parse from buffer, avoiding slow per-character readCh() + private final static boolean USE_CHEATS = true; + + // don't waste too much space in hashtables + private final static int DEFAULT_ATTR_COUNT = 23; + + + ////////////////////////////////////////////////////////////////////// + // Constructors. + //////////////////////////////////////////////////////////////////////// + + + /** + * Construct a new parser with no associated handler. + * @see #setHandler + * @see #parse + */ + // package private +// +// XmlParser () + StructXmlParser () +// + { + cleanupVariables (); + } + + + /** + * Set the handler that will receive parsing events. + * @param handler The handler to receive callback events. + * @see #parse + */ + // package private +// +// final class XmlParser +// void setHandler (SaxDriver handler) + void setHandler (StructSaxDriver handler) +// + { + this.handler = handler; + } + + + /** + * Parse an XML document from the character stream, byte stream, or URI + * that you provide (in that order of preference). Any URI that you + * supply will become the base URI for resolving relative URI, and may + * be used to acquire a reader or byte stream. + * + *

You may parse more than one document, but that must be done + * sequentially. Only one thread at a time may use this parser. + * + * @param systemId The URI of the document; should never be null, + * but may be so iff a reader or a stream is provided. + * @param publicId The public identifier of the document, or null. + * @param reader A character stream; must be null if stream isn't. + * @param stream A byte input stream; must be null if reader isn't. + * @param encoding The suggested encoding, or null if unknown. + * @exception java.lang.Exception Basically SAXException or IOException + */ + // package private + void doParse ( + String systemId, + String publicId, + Reader reader, + InputStream stream, + String encoding + ) throws Exception + { + if (handler == null) + throw new IllegalStateException ("no callback handler"); + + basePublicId = publicId; + baseURI = systemId; + baseReader = reader; + baseInputStream = stream; + + initializeVariables (); + + // predeclare the built-in entities here (replacement texts) + // we don't need to intern(), since we're guaranteed literals + // are always (globally) interned. + setInternalEntity ("amp", "&"); + setInternalEntity ("lt", "<"); + setInternalEntity ("gt", ">"); + setInternalEntity ("apos", "'"); + setInternalEntity ("quot", """); + + handler.startDocument (); + + pushURL ("[document]", basePublicId, baseURI, + baseReader, baseInputStream, encoding); + + try { + parseDocument (); + handler.endDocument (); + } finally { + if (baseReader != null) + try { baseReader.close (); + } catch (IOException e) { /* ignore */ } + if (baseInputStream != null) + try { baseInputStream.close (); + } catch (IOException e) { /* ignore */ } + if (is != null) + try { is.close (); + } catch (IOException e) { /* ignore */ } + if (reader != null) + try { + reader.close (); + } catch (IOException e) { /* ignore */ + } + cleanupVariables (); + } + } + + + //////////////////////////////////////////////////////////////////////// + // Constants. + //////////////////////////////////////////////////////////////////////// + + // + // Constants for element content type. + // + + /** + * Constant: an element has not been declared. + * @see #getElementContentType + */ + public final static int CONTENT_UNDECLARED = 0; + + /** + * Constant: the element has a content model of ANY. + * @see #getElementContentType + */ + public final static int CONTENT_ANY = 1; + + /** + * Constant: the element has declared content of EMPTY. + * @see #getElementContentType + */ + public final static int CONTENT_EMPTY = 2; + + /** + * Constant: the element has mixed content. + * @see #getElementContentType + */ + public final static int CONTENT_MIXED = 3; + + /** + * Constant: the element has element content. + * @see #getElementContentType + */ + public final static int CONTENT_ELEMENTS = 4; + + + // + // Constants for the entity type. + // + + /** + * Constant: the entity has not been declared. + * @see #getEntityType + */ + public final static int ENTITY_UNDECLARED = 0; + + /** + * Constant: the entity is internal. + * @see #getEntityType + */ + public final static int ENTITY_INTERNAL = 1; + + /** + * Constant: the entity is external, non-XML data. + * @see #getEntityType + */ + public final static int ENTITY_NDATA = 2; + + /** + * Constant: the entity is external XML data. + * @see #getEntityType + */ + public final static int ENTITY_TEXT = 3; + + + // + // Constants for attribute type. + // + + /** + * Constant: the attribute has not been declared for this element type. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_UNDECLARED = 0; + + /** + * Constant: the attribute value is a string value. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_CDATA = 1; + + /** + * Constant: the attribute value is a unique identifier. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ID = 2; + + /** + * Constant: the attribute value is a reference to a unique identifier. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_IDREF = 3; + + /** + * Constant: the attribute value is a list of ID references. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_IDREFS = 4; + + /** + * Constant: the attribute value is the name of an entity. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENTITY = 5; + + /** + * Constant: the attribute value is a list of entity names. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENTITIES = 6; + + /** + * Constant: the attribute value is a name token. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_NMTOKEN = 7; + + /** + * Constant: the attribute value is a list of name tokens. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_NMTOKENS = 8; + + /** + * Constant: the attribute value is a token from an enumeration. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_ENUMERATED = 9; + + /** + * Constant: the attribute is the name of a notation. + * @see #getAttributeType + */ + public final static int ATTRIBUTE_NOTATION = 10; + + + // + // When the class is loaded, populate the hash table of + // attribute types. + // + + /** + * Hash table of attribute types. + */ + private static Hashtable attributeTypeHash; + static { + attributeTypeHash = new Hashtable (13); + attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA)); + attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID)); + attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF)); + attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS)); + attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY)); + attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES)); + attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN)); + attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS)); + attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION)); + } + + + // + // Constants for supported encodings. "external" is just a flag. + // + private final static int ENCODING_EXTERNAL = 0; + private final static int ENCODING_UTF_8 = 1; + private final static int ENCODING_ISO_8859_1 = 2; + private final static int ENCODING_UCS_2_12 = 3; + private final static int ENCODING_UCS_2_21 = 4; + private final static int ENCODING_UCS_4_1234 = 5; + private final static int ENCODING_UCS_4_4321 = 6; + private final static int ENCODING_UCS_4_2143 = 7; + private final static int ENCODING_UCS_4_3412 = 8; + private final static int ENCODING_ASCII = 9; + + + // + // Constants for attribute default value. + // + + /** + * Constant: the attribute is not declared. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; + + /** + * Constant: the attribute has a literal default value specified. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; + + /** + * Constant: the attribute was declared #IMPLIED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; + + /** + * Constant: the attribute was declared #REQUIRED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; + + /** + * Constant: the attribute was declared #FIXED. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_FIXED = 34; + + + // + // Constants for input. + // + private final static int INPUT_NONE = 0; + private final static int INPUT_INTERNAL = 1; + private final static int INPUT_EXTERNAL = 2; + private final static int INPUT_STREAM = 3; + private final static int INPUT_BUFFER = 4; + private final static int INPUT_READER = 5; + + + // + // Flags for reading literals. + // + // expand general entity refs (attribute values in dtd and content) + private final static int LIT_ENTITY_REF = 2; + // normalize this value (whitespace etc) (attributes, public ids) + private final static int LIT_NORMALIZE = 4; + // literal is an attribute value + private final static int LIT_ATTRIBUTE = 8; + // don't expand parameter entities + private final static int LIT_DISABLE_PE = 16; + // don't expand [or parse] character refs + private final static int LIT_DISABLE_CREF = 32; + // don't parse general entity refs + private final static int LIT_DISABLE_EREF = 64; + // don't expand general entities, but make sure we _could_ + private final static int LIT_ENTITY_CHECK = 128; + + + // + // Flags affecting PE handling in DTDs (if expandPE is true). + // PEs expand with space padding, except inside literals. + // + private final static int CONTEXT_NORMAL = 0; + private final static int CONTEXT_LITERAL = 1; + + + ////////////////////////////////////////////////////////////////////// + // Error reporting. + ////////////////////////////////////////////////////////////////////// + + + /** + * Report an error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + * @see StructSaxDriver#error + * @see #line + */ + private void error (String message, String textFound, String textExpected) + throws SAXException + { + if (textFound != null) { + message = message + " (found \"" + textFound + "\")"; + } + if (textExpected != null) { + message = message + " (expected \"" + textExpected + "\")"; + } + String uri = null; + + if (externalEntity != null) { + uri = externalEntity.getURL ().toString (); + } + handler.error (message, uri, line, column); + + // "can't happen" + throw new SAXException (message); + } + + + /** + * Report a serious error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + */ + private void error (String message, char textFound, String textExpected) + throws SAXException + { + error (message, new Character (textFound).toString (), textExpected); + } + + /** Report typical case fatal errors. */ + private void error (String message) + throws SAXException + { + error (message, null, null); + } + + + ////////////////////////////////////////////////////////////////////// + // Major syntactic productions. + ////////////////////////////////////////////////////////////////////// + + + /** + * Parse an XML document. + *

+     * [1] document ::= prolog element Misc*
+     * 
+ *

This is the top-level parsing function for a single XML + * document. As a minimum, a well-formed document must have + * a document element, and a valid document must have a prolog + * (one with doctype) as well. + */ + private void parseDocument () + throws Exception + { + char c; + +// + if (showDocStructure) { + + // + // Struct: updated to show the comments + // + + handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); + handler.startElement ("str:document"); + } +// + + parseProlog (); + +// + if (showDocStructure) { + + // + // Struct: updated to show the document structure + // + + handler.startElement ("str:body"); + + } +// + + require ('<'); + parseElement (); + +// + if (showDocStructure) { + + // + // Struct: updated to show the document structure + // + + handler.endElement ("str:body"); + handler.startElement ("str:epilog"); + + } +// + + try { + parseMisc (); //skip all white, PIs, and comments + c = readCh (); //if this doesn't throw an exception... + error ("unexpected characters after document end", c, null); + } catch (EOFException e) { + return; + } + +// + if (showDocStructure) { + + // + // Struct: updated to show the comments + // + + handler.endElement ("str:epilog"); + handler.endElement ("str:document"); + } +// + + + } + + + /** + * Skip a comment. + *

+     * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
+     * 
+ *

(The <!-- has already been read.) + */ + private void parseComment () + throws Exception + { + char c; + boolean saved = expandPE; + + expandPE = false; + parseUntil ("--"); + require ('>'); + expandPE = saved; +// + if (! translateComments ) { +// + handler.comment (dataBuffer, 0, dataBufferPos); +// + } else { + // + // Struct: updated to show the comments + // + System.err.println("Comment : " ); + handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); + handler.startElement ("str:comment"); + handler.charData (dataBuffer, 0, dataBufferPos); + handler.endElement ("str:comment"); + + } +// + dataBufferPos = 0; + } + + + /** + * Parse a processing instruction and do a call-back. + *

+     * [16] PI ::= '<?' PITarget
+     *		(S (Char* - (Char* '?>' Char*)))?
+     *		'?>'
+     * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
+     * 
+ *

(The <? has already been read.) + */ + private void parsePI () + throws SAXException, IOException + { + String name; + boolean saved = expandPE; + + expandPE = false; + name = readNmtoken (true); + if ("xml".equalsIgnoreCase (name)) + error ("Illegal processing instruction target", name, null); + if (!tryRead ("?>")) { + requireWhitespace (); + parseUntil ("?>"); + } + expandPE = saved; + handler.processingInstruction (name, dataBufferToString ()); + } + + + /** + * Parse a CDATA section. + *

+     * [18] CDSect ::= CDStart CData CDEnd
+     * [19] CDStart ::= '<![CDATA['
+     * [20] CData ::= (Char* - (Char* ']]>' Char*))
+     * [21] CDEnd ::= ']]>'
+     * 
+ *

(The '<![CDATA[' has already been read.) + */ + private void parseCDSect () + throws Exception + { + parseUntil ("]]>"); + dataBufferFlush (); + } + + + /** + * Parse the prolog of an XML document. + *

+     * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
+     * 
+ *

There are a couple of tricks here. First, it is necessary to + * declare the XML default attributes after the DTD (if present) + * has been read. [??] Second, it is not possible to expand general + * references in attribute value literals until after the entire + * DTD (if present) has been parsed. + *

We do not look for the XML declaration here, because it was + * handled by pushURL (). + * @see pushURL + */ + private void parseProlog () + throws Exception + { + +// + if (showDocStructure) { + + // + // Struct: updated to show the prolog + // + + handler.startElement ("str:prolog"); + if (version != null) + handler.attribute ("str:version", version, false); + if (encodingName != null) + handler.attribute ("str:encoding", encodingName, false); + if (standalone != null) + handler.attribute ("str:standalone", standalone, false); + handler.startElement ("str:X-M-L-Decl"); + handler.endElement ("str:X-M-L-Decl"); + + } +// + + parseMisc (); + + if (tryRead (" + if (showDocStructure) { + + // + // Struct: updated to show the prolog + // + + handler.endElement ("str:prolog"); + } +// + + } + + + /** + * Parse the XML declaration. + *

+     * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+     * [24] VersionInfo ::= S 'version' Eq
+     *		("'" VersionNum "'" | '"' VersionNum '"' )
+     * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
+     * [32] SDDecl ::= S 'standalone' Eq
+     *		( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
+     * [80] EncodingDecl ::= S 'encoding' Eq
+     *		( "'" EncName "'" | "'" EncName "'" )
+     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+     * 
+ *

(The <?xml and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseTextDecl + * @see #setupDecoding + */ + private String parseXMLDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + boolean white; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + // Read the version. + require ("version"); + parseEq (); + version = readLiteral (flags); + if (!version.equals ("1.0")) { + error ("unsupported XML version", version, "1.0"); + } + + // Try reading an encoding declaration. + white = tryWhitespace (); + if (tryRead ("encoding")) { + if (!white) + error ("whitespace required before 'encoding='"); + parseEq (); + encodingName = readLiteral (flags); + if (!ignoreEncoding) + setupDecoding (encodingName); + } + + // Try reading a standalone declaration + if (encodingName != null) + white = tryWhitespace (); + if (tryRead ("standalone")) { + if (!white) + error ("whitespace required before 'standalone='"); + parseEq (); + standalone = readLiteral (flags); + if (! ("yes".equals (standalone) || "no".equals (standalone))) + error ("standalone flag must be 'yes' or 'no'"); + } + + skipWhitespace (); + require ("?>"); + + return encodingName; + } + + + /** + * Parse a text declaration. + *

+     * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
+     * [80] EncodingDecl ::= S 'encoding' Eq
+     *		( '"' EncName '"' | "'" EncName "'" )
+     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+     * 
+ *

(The <?xml' and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseXMLDecl + * @see #setupDecoding + */ + private String parseTextDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + String encodingName = null; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + // Read an optional version. + if (tryRead ("version")) { + String version; + parseEq (); + version = readLiteral (flags); + if (!version.equals ("1.0")) { + error ("unsupported XML version", version, "1.0"); + } + requireWhitespace (); + } + + + // Read the encoding. + require ("encoding"); + parseEq (); + encodingName = readLiteral (flags); + if (!ignoreEncoding) + setupDecoding (encodingName); + + skipWhitespace (); + require ("?>"); + + return encodingName; + } + + + /** + * Sets up internal state so that we can decode an entity using the + * specified encoding. This is used when we start to read an entity + * and we have been given knowledge of its encoding before we start to + * read any data (e.g. from a SAX input source or from a MIME type). + * + *

It is also used after autodetection, at which point only very + * limited adjustments to the encoding may be used (switching between + * related builtin decoders). + * + * @param encodingName The name of the encoding specified by the user. + * @exception IOException if the encoding isn't supported either + * internally to this parser, or by the hosting JVM. + * @see #parseXMLDecl + * @see #parseTextDecl + */ + private void setupDecoding (String encodingName) + throws SAXException, IOException + { + encodingName = encodingName.toUpperCase (); + + // ENCODING_EXTERNAL indicates an encoding that wasn't + // autodetected ... we can use builtin decoders, or + // ones from the JVM (InputStreamReader). + + // Otherwise we can only tweak what was autodetected, and + // only for single byte (ASCII derived) builtin encodings. + + // ASCII-derived encodings + if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { + if (encodingName.equals ("ISO-8859-1") + || encodingName.equals ("8859_1") + || encodingName.equals ("ISO8859_1") + ) { + encoding = ENCODING_ISO_8859_1; + return; + } else if (encodingName.equals ("US-ASCII") + || encodingName.equals ("ASCII")) { + encoding = ENCODING_ASCII; + return; + } else if (encodingName.equals ("UTF-8") + || encodingName.equals ("UTF8")) { + encoding = ENCODING_UTF_8; + return; + } else if (encoding != ENCODING_EXTERNAL) { + // fatal error + error ("unsupported ASCII-derived encoding", + encodingName, + "UTF-8, US-ASCII, or ISO-8859-1"); + } + // else fallthrough ... + // it's ASCII-ish and something other than a builtin + } + + // Unicode and such + if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { + if (!(encodingName.equals ("ISO-10646-UCS-2") + || encodingName.equals ("UTF-16") + || encodingName.equals ("UTF-16BE") + || encodingName.equals ("UTF-16LE"))) + error ("unsupported Unicode encoding", + encodingName, + "UTF-16"); + return; + } + + // four byte encodings + if (encoding == ENCODING_UCS_4_1234 + || encoding == ENCODING_UCS_4_4321 + || encoding == ENCODING_UCS_4_2143 + || encoding == ENCODING_UCS_4_3412) { + if (!encodingName.equals ("ISO-10646-UCS-4")) + error ("unsupported 32-bit encoding", + encodingName, + "ISO-10646-UCS-4"); + return; + } + + // assert encoding == ENCODING_EXTERNAL + // if (encoding != ENCODING_EXTERNAL) + // throw new RuntimeException ("encoding = " + encoding); + + if (encodingName.equals ("UTF-16BE")) { + encoding = ENCODING_UCS_2_12; + return; + } + if (encodingName.equals ("UTF-16LE")) { + encoding = ENCODING_UCS_2_21; + return; + } + + // We couldn't use the builtin decoders at all. But we can try to + // create a reader, since we haven't messed up buffering. Tweak + // the encoding name if necessary. + + if (encodingName.equals ("UTF-16") + || encodingName.equals ("ISO-10646-UCS-2")) + encodingName = "Unicode"; + // Ignoring all the EBCDIC aliases here + + reader = new InputStreamReader (is, encodingName); + sourceType = INPUT_READER; + is = null; + } + + + /** + * Parse miscellaneous markup outside the document element and DOCTYPE + * declaration. + *

+     * [27] Misc ::= Comment | PI | S
+     * 
+ */ + private void parseMisc () + throws Exception + { + while (true) { + skipWhitespace (); + if (tryRead (" + * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? + * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' + * + *

(The <!DOCTYPE has already been read.) + */ + private void parseDoctypedecl () + throws Exception + { + char c; + String doctypeName, ids[]; + + // Read the document type name. + requireWhitespace (); + doctypeName = readNmtoken (true); + + // Read the External subset's IDs + skipWhitespace (); + ids = readExternalIds (false); + + // report (a) declaration of name, (b) lexical info (ids) + handler.doctypeDecl (doctypeName, ids [0], ids [1]); + +// + if (showDocStructure) { + + // + // Struct: updated to show the doctype + // + + if (doctypeName != null) + handler.attribute ("str:name", doctypeName, false); + if (ids [0] != null) + handler.attribute ("str:publicId", ids [0], false); + if (ids [1] != null) + handler.attribute ("str:systemId", ids [1], false); + handler.startElement ("str:doctype"); + + } +// + + // Internal subset is parsed first, if present + skipWhitespace (); + if (tryRead ('[')) { + + // loop until the subset ends + while (true) { + expandPE = true; + skipWhitespace (); + expandPE = false; + if (tryRead (']')) { + break; // end of subset + } else { + // WFC, PEs in internal subset (only between decls) + peIsError = expandPE = true; + parseMarkupdecl (); + peIsError = expandPE = false; + } + } + } + + // Read the external subset, if any + if (ids [1] != null) { + pushURL ("[external subset]", ids [0], ids [1], null, null, null); + + // Loop until we end up back at '>' + while (true) { + expandPE = true; + skipWhitespace (); + expandPE = false; + if (tryRead ('>')) { + break; + } else { + expandPE = true; + parseMarkupdecl (); + expandPE = false; + } + } + } else { + // No external subset. + skipWhitespace (); + require ('>'); + } + + // done dtd + handler.endDoctype (); + expandPE = false; + +// + if (showDocStructure) { + + // + // Struct: updated to show the doctype + // + + handler.endElement ("str:doctype"); + + } +// + + + } + + + /** + * Parse a markup declaration in the internal or external DTD subset. + *

+     * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
+     *		| NotationDecl | PI | Comment
+     * [30] extSubsetDecl ::= (markupdecl | conditionalSect
+     *		| PEReference | S) *
+     * 
+ *

Reading toplevel PE references is handled as a lexical issue + * by the caller, as is whitespace. + */ + private void parseMarkupdecl () + throws Exception + { + if (tryRead (" 0) + parseConditionalSect (); + else + error ("conditional sections illegal in internal subset"); + } else { + error ("expected markup declaration"); + } + } + + + /** + * Parse an element, with its tags. + *

+     * [39] element ::= EmptyElementTag | STag content ETag
+     * [40] STag ::= '<' Name (S Attribute)* S? '>'
+     * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
+     * 
+ *

(The '<' has already been read.) + *

NOTE: this method actually chains onto parseContent (), if necessary, + * and parseContent () will take care of calling parseETag (). + */ + private void parseElement () + throws Exception + { + String gi; + char c; + int oldElementContent = currentElementContent; + String oldElement = currentElement; + Object element []; + + // This is the (global) counter for the + // array of specified attributes. + tagAttributePos = 0; + + // Read the element type name. + gi = readNmtoken (true); + + // Determine the current content type. + currentElement = gi; + element = (Object []) elementInfo.get (gi); + currentElementContent = getContentType (element, CONTENT_ANY); + + // Read the attributes, if any. + // After this loop, "c" is the closing delimiter. + boolean white = tryWhitespace (); + c = readCh (); + while (c != '/' && c != '>') { + unread (c); + if (!white) + error ("need whitespace between attributes"); + parseAttribute (gi); + white = tryWhitespace (); + c = readCh (); + } + + // Supply any defaulted attributes. + Enumeration atts = declaredAttributes (element); + if (atts != null) { + String aname; +loop: + while (atts.hasMoreElements ()) { + aname = (String) atts.nextElement (); + // See if it was specified. + for (int i = 0; i < tagAttributePos; i++) { + if (tagAttributes [i] == aname) { + continue loop; + } + } + // I guess not... + handler.attribute (aname, + getAttributeExpandedValue (gi, aname), + false); + } + } + + // Figure out if this is a start tag + // or an empty element, and dispatch an + // event accordingly. + switch (c) { + case '>': + handler.startElement (gi); + parseContent (); + break; + case '/': + require ('>'); + handler.startElement (gi); + handler.endElement (gi); + break; + } + + // Restore the previous state. + currentElement = oldElement; + currentElementContent = oldElementContent; + } + + + /** + * Parse an attribute assignment. + *

+     * [41] Attribute ::= Name Eq AttValue
+     * 
+ * @param name The name of the attribute's element. + * @see StructSaxDriver#attribute + */ + private void parseAttribute (String name) + throws Exception + { + String aname; + int type; + String value; + int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; + + // Read the attribute name. + aname = readNmtoken (true); + type = getAttributeType (name, aname); + + // Parse '=' + parseEq (); + + // Read the value, normalizing whitespace + // unless it is CDATA. + if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { + value = readLiteral (flags); + } else { + value = readLiteral (flags | LIT_NORMALIZE); + } + + // WFC: no duplicate attributes + for (int i = 0; i < tagAttributePos; i++) + if (aname.equals (tagAttributes [i])) + error ("duplicate attribute", aname, null); + + // Inform the handler about the + // attribute. + handler.attribute (aname, value, true); + dataBufferPos = 0; + + // Note that the attribute has been + // specified. + if (tagAttributePos == tagAttributes.length) { + String newAttrib[] = new String [tagAttributes.length * 2]; + System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos); + tagAttributes = newAttrib; + } + tagAttributes [tagAttributePos++] = aname; + } + + + /** + * Parse an equals sign surrounded by optional whitespace. + *
+     * [25] Eq ::= S? '=' S?
+     * 
+ */ + private void parseEq () + throws SAXException, IOException + { + skipWhitespace (); + require ('='); + skipWhitespace (); + } + + + /** + * Parse an end tag. + *
+     * [42] ETag ::= ''
+     * 
+ *

NOTE: parseContent () chains to here, we already read the + * "</". + */ + private void parseETag () + throws Exception + { + require (currentElement); + skipWhitespace (); + require ('>'); + handler.endElement (currentElement); + // not re-reporting any SAXException re bogus end tags, + // even though that diagnostic might be clearer ... + } + + + /** + * Parse the content of an element. + *

+     * [43] content ::= (element | CharData | Reference
+     *		| CDSect | PI | Comment)*
+     * [67] Reference ::= EntityRef | CharRef
+     * 
+ *

NOTE: consumes ETtag. + */ + private void parseContent () + throws Exception + { + String data; + char c; + + while (true) { + + switch (currentElementContent) { + case CONTENT_ANY: + case CONTENT_MIXED: + parseCharData (); + break; + case CONTENT_ELEMENTS: + parseWhitespace (); + break; + } + + // Handle delimiters + c = readCh (); + switch (c) { + + case '&': // Found "&" + + c = readCh (); + if (c == '#') { + parseCharRef (); + } else { + unread (c); + parseEntityRef (true); + } + break; + + case '<': // Found "<" + dataBufferFlush (); + c = readCh (); + switch (c) { + case '!': // Found " + * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + * + *

NOTE: the '<!ELEMENT' has already been read. + */ + private void parseElementdecl () + throws Exception + { + String name; + + requireWhitespace (); + // Read the element type name. + name = readNmtoken (true); + + requireWhitespace (); + // Read the content model. + parseContentspec (name); + + skipWhitespace (); + require ('>'); + } + + + /** + * Content specification. + *

+     * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
+     * 
+ */ + private void parseContentspec (String name) + throws Exception + { + if (tryRead ("EMPTY")) { + setElement (name, CONTENT_EMPTY, null); + return; + } else if (tryRead ("ANY")) { + setElement (name, CONTENT_ANY, null); + return; + } else { + require ('('); + dataBufferAppend ('('); + skipWhitespace (); + if (tryRead ("#PCDATA")) { + dataBufferAppend ("#PCDATA"); + parseMixed (); + setElement (name, CONTENT_MIXED, dataBufferToString ()); + } else { + parseElements (); + setElement (name, CONTENT_ELEMENTS, dataBufferToString ()); + } + } + } + + + /** + * Parse an element-content model. + *
+     * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
+     * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
+     * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
+     * 
+ * + *

NOTE: the opening '(' and S have already been read. + */ + private void parseElements () + throws Exception + { + char c; + char sep; + + // Parse the first content particle + skipWhitespace (); + parseCp (); + + // Check for end or for a separator. + skipWhitespace (); + c = readCh (); + switch (c) { + case ')': + dataBufferAppend (')'); + c = readCh (); + switch (c) { + case '*': + case '+': + case '?': + dataBufferAppend (c); + break; + default: + unread (c); + } + return; + case ',': // Register the separator. + case '|': + sep = c; + dataBufferAppend (c); + break; + default: + error ("bad separator in content model", c, null); + return; + } + + // Parse the rest of the content model. + while (true) { + skipWhitespace (); + parseCp (); + skipWhitespace (); + c = readCh (); + if (c == ')') { + dataBufferAppend (')'); + break; + } else if (c != sep) { + error ("bad separator in content model", c, null); + return; + } else { + dataBufferAppend (c); + } + } + + // Check for the occurrence indicator. + c = readCh (); + switch (c) { + case '?': + case '*': + case '+': + dataBufferAppend (c); + return; + default: + unread (c); + return; + } + } + + + /** + * Parse a content particle. + *

+     * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+     * 
+ */ + private void parseCp () + throws Exception + { + char c; + + if (tryRead ('(')) { + dataBufferAppend ('('); + parseElements (); + } else { + dataBufferAppend (readNmtoken (true)); + c = readCh (); + switch (c) { + case '?': + case '*': + case '+': + dataBufferAppend (c); + break; + default: + unread (c); + break; + } + } + } + + + /** + * Parse mixed content. + *
+     * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
+     *	      | '(' S? ('#PCDATA') S? ')'
+     * 
+ */ + private void parseMixed () + throws Exception + { + char c; + + // Check for PCDATA alone. + skipWhitespace (); + if (tryRead (')')) { + dataBufferAppend (")*"); + tryRead ('*'); + return; + } + + // Parse mixed content. + skipWhitespace (); + while (!tryRead (")*")) { + require ('|'); + dataBufferAppend ('|'); + skipWhitespace (); + dataBufferAppend (readNmtoken (true)); + skipWhitespace (); + } + dataBufferAppend (")*"); + } + + + /** + * Parse an attribute list declaration. + *
+     * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
+     * 
+ *

NOTE: the '<!ATTLIST' has already been read. + */ + private void parseAttlistDecl () + throws Exception + { + String elementName; + + requireWhitespace (); + elementName = readNmtoken (true); + boolean white = tryWhitespace (); + while (!tryRead ('>')) { + if (!white) + error ("whitespace required before attribute definition"); + parseAttDef (elementName); + white = tryWhitespace (); + } + } + + + /** + * Parse a single attribute definition. + *

+     * [53] AttDef ::= S Name S AttType S DefaultDecl
+     * 
+ */ + private void parseAttDef (String elementName) + throws Exception + { + String name; + int type; + String enum = null; + + // Read the attribute name. + name = readNmtoken (true); + + // Read the attribute type. + requireWhitespace (); + type = readAttType (); + + // Get the string of enumerated values + // if necessary. + if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { + enum = dataBufferToString (); + } + + // Read the default value. + requireWhitespace (); + parseDefault (elementName, name, type, enum); + } + + + /** + * Parse the attribute type. + *
+     * [54] AttType ::= StringType | TokenizedType | EnumeratedType
+     * [55] StringType ::= 'CDATA'
+     * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
+     *		| 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
+     * [57] EnumeratedType ::= NotationType | Enumeration
+     * 
+ */ + private int readAttType () + throws Exception + { + String typeString; + Integer type; + + if (tryRead ('(')) { + parseEnumeration (false); + return ATTRIBUTE_ENUMERATED; + } else { + typeString = readNmtoken (true); + if (typeString.equals ("NOTATION")) { + parseNotationType (); + } + type = (Integer) attributeTypeHash.get (typeString); + if (type == null) { + error ("illegal attribute type", typeString, null); + return ATTRIBUTE_UNDECLARED; + } else { + return type.intValue (); + } + } + } + + + /** + * Parse an enumeration. + *
+     * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
+     * 
+ *

NOTE: the '(' has already been read. + */ + private void parseEnumeration (boolean isNames) + throws Exception + { + char c; + + dataBufferAppend ('('); + + // Read the first token. + skipWhitespace (); + dataBufferAppend (readNmtoken (isNames)); + // Read the remaining tokens. + skipWhitespace (); + while (!tryRead (')')) { + require ('|'); + dataBufferAppend ('|'); + skipWhitespace (); + dataBufferAppend (readNmtoken (isNames)); + skipWhitespace (); + } + dataBufferAppend (')'); + } + + + /** + * Parse a notation type for an attribute. + *

+     * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
+     *		(S? '|' S? name)* S? ')'
+     * 
+ *

NOTE: the 'NOTATION' has already been read + */ + private void parseNotationType () + throws Exception + { + requireWhitespace (); + require ('('); + + parseEnumeration (true); + } + + + /** + * Parse the default value for an attribute. + *

+     * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
+     *		| (('#FIXED' S)? AttValue)
+     * 
+ */ + private void parseDefault ( + String elementName, + String name, + int type, + String enum + ) throws Exception + { + int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; + String value = null; + int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK; + + // Note: char refs not checked here, and input not normalized, + // since it's done correctly later when we actually expand any + // entity refs. We ought to report char ref syntax errors now, + // but don't. Cost: unused defaults mean unreported WF errs. + + // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace + // chars to spaces (doesn't matter when that's done if it doesn't + // interfere with char refs expanding to whitespace). + + if (tryRead ('#')) { + if (tryRead ("FIXED")) { + valueType = ATTRIBUTE_DEFAULT_FIXED; + requireWhitespace (); + value = readLiteral (flags); + } else if (tryRead ("REQUIRED")) { + valueType = ATTRIBUTE_DEFAULT_REQUIRED; + } else if (tryRead ("IMPLIED")) { + valueType = ATTRIBUTE_DEFAULT_IMPLIED; + } else { + error ("illegal keyword for attribute default value"); + } + } else + value = readLiteral (flags); + setAttribute (elementName, name, type, enum, value, valueType); + } + + + /** + * Parse a conditional section. + *
+     * [61] conditionalSect ::= includeSect || ignoreSect
+     * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
+     *		extSubsetDecl ']]>'
+     * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
+     *		ignoreSectContents* ']]>'
+     * [64] ignoreSectContents ::= Ignore
+     *		('<![' ignoreSectContents* ']]>' Ignore )*
+     * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
+     * 
+ *

NOTE: the '>![' has already been read. + */ + private void parseConditionalSect () + throws Exception + { + skipWhitespace (); + if (tryRead ("INCLUDE")) { + skipWhitespace (); + require ('['); + skipWhitespace (); + while (!tryRead ("]]>")) { + parseMarkupdecl (); + skipWhitespace (); + } + } else if (tryRead ("IGNORE")) { + skipWhitespace (); + require ('['); + int nesting = 1; + char c; + expandPE = false; + for (int nest = 1; nest > 0;) { + c = readCh (); + switch (c) { + case '<': + if (tryRead ("![")) { + nest++; + } + case ']': + if (tryRead ("]>")) { + nest--; + } + } + } + expandPE = true; + } else { + error ("conditional section must begin with INCLUDE or IGNORE"); + } + } + + + /** + * Read and interpret a character reference. + *

+     * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+     * 
+ *

NOTE: the '&#' has already been read. + */ + private void parseCharRef () + throws SAXException, IOException + { + int value = 0; + char c; + + if (tryRead ('x')) { +loop1: + while (true) { + c = readCh (); + switch (c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'A': + case 'b': + case 'B': + case 'c': + case 'C': + case 'd': + case 'D': + case 'e': + case 'E': + case 'f': + case 'F': + value *= 16; + value += Integer.parseInt (new Character (c).toString (), + 16); + break; + case ';': + break loop1; + default: + error ("illegal character in character reference", c, null); + break loop1; + } + } + } else { +loop2: + while (true) { + c = readCh (); + switch (c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + value *= 10; + value += Integer.parseInt (new Character (c).toString (), + 10); + break; + case ';': + break loop2; + default: + error ("illegal character in character reference", c, null); + break loop2; + } + } + } + + // check for character refs being legal XML + if ((value < 0x0020 + && ! (value == '\n' || value == '\t' || value == '\r')) + || (value >= 0xD800 && value <= 0xDFFF) + || value == 0xFFFE || value == 0xFFFF + || value > 0x0010ffff) + error ("illegal XML character reference U+" + + Integer.toHexString (value)); + + // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz + // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: + if (value <= 0x0000ffff) { + // no surrogates needed + dataBufferAppend ((char) value); + } else if (value <= 0x0010ffff) { + value -= 0x10000; + // > 16 bits, surrogate needed + dataBufferAppend ((char) (0xd800 | (value >> 10))); + dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff))); + } else { + // too big for surrogate + error ("character reference " + value + " is too large for UTF-16", + new Integer (value).toString (), null); + } + } + + + /** + * Parse and expand an entity reference. + *

+     * [68] EntityRef ::= '&' Name ';'
+     * 
+ *

NOTE: the '&' has already been read. + * @param externalAllowed External entities are allowed here. + */ + private void parseEntityRef (boolean externalAllowed) + throws SAXException, IOException + { + String name; + + name = readNmtoken (true); + require (';'); +// + if (! translateExternalParsedEntities) { +// + switch (getEntityType (name)) { + case ENTITY_UNDECLARED: + error ("reference to undeclared entity", name, null); + break; + case ENTITY_INTERNAL: + System.err.println("Internal"); + pushString (name, getEntityValue (name)); + break; + case ENTITY_TEXT: + System.err.println("Text"); + if (externalAllowed) { + pushURL (name, getEntityPublicId (name), + getEntitySystemId (name), + null, null, null); + } else { + error ("reference to external entity in attribute value.", + name, null); + } + break; + case ENTITY_NDATA: + System.err.println("NDATA"); + if (externalAllowed) { + error ("unparsed entity reference in content", name, null); + } else { + error ("reference to external entity in attribute value.", + name, null); + } + break; + } +// + } else { + // + // Struct: updated to show the entity call + // + System.err.println("Entity reference : " + name); + handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); + handler.attribute ("str:name", name, false); + handler.startElement ("str:entity"); + handler.endElement ("str:entity"); + + } + // + } + + + /** + * Parse and expand a parameter entity reference. + *

+     * [69] PEReference ::= '%' Name ';'
+     * 
+ *

NOTE: the '%' has already been read. + */ + private void parsePEReference () + throws SAXException, IOException + { + String name; + + name = "%" + readNmtoken (true); + require (';'); + switch (getEntityType (name)) { + case ENTITY_UNDECLARED: + // this is a validity problem, not a WFC violation ... but + // we should disable handling of all subsequent declarations + // unless this is a standalone document + // warn ("reference to undeclared parameter entity", name, null); + + break; + case ENTITY_INTERNAL: + if (inLiteral) + pushString (name, getEntityValue (name)); + else + pushString (name, " " + getEntityValue (name) + ' '); + break; + case ENTITY_TEXT: + if (!inLiteral) + pushString (null, " "); + pushURL (name, getEntityPublicId (name), + getEntitySystemId (name), + null, null, null); + if (!inLiteral) + pushString (null, " "); + break; + } + } + + /** + * Parse an entity declaration. + *

+     * [70] EntityDecl ::= GEDecl | PEDecl
+     * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+     * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+     * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+     * [74] PEDef ::= EntityValue | ExternalID
+     * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
+     *		   | 'PUBLIC' S PubidLiteral S SystemLiteral
+     * [76] NDataDecl ::= S 'NDATA' S Name
+     * 
+ *

NOTE: the '<!ENTITY' has already been read. + */ + private void parseEntityDecl () + throws Exception + { + char c; + boolean peFlag = false; + String name, value, notationName, ids[]; + + // Check for a parameter entity. + expandPE = false; + requireWhitespace (); + if (tryRead ('%')) { + peFlag = true; + requireWhitespace (); + } + expandPE = true; + + // Read the entity name, and prepend + // '%' if necessary. + name = readNmtoken (true); + if (peFlag) { + name = "%" + name; + } + + // Read the entity value. + requireWhitespace (); + c = readCh (); + unread (c); + if (c == '"' || c == '\'') { + // Internal entity ... replacement text has expanded refs + // to characters and PEs, but not to general entities + value = readLiteral (0); + setInternalEntity (name, value); + } else { + // Read the external IDs + ids = readExternalIds (false); + if (ids [1] == null) { + error ("system identifer missing", name, null); + } + + // Check for NDATA declaration. + boolean white = tryWhitespace (); + if (!peFlag && tryRead ("NDATA")) { + if (!white) + error ("whitespace required before NDATA"); + requireWhitespace (); + notationName = readNmtoken (true); + setExternalDataEntity (name, ids [0], ids [1], notationName); + } else { + +// + if (showDocStructure) { + + // + // Struct: updated to show the external entities definitions + // + + handler.attribute ("str:name", name, false); + handler.attribute ("str:type", ids [0], false); + handler.attribute ("str:systemId", ids [1], false); + handler.startElement ("str:externalEntityDefinition"); + handler.endElement ("str:externalEntityDefinition"); + + } +// + + + setExternalTextEntity (name, ids [0], ids [1]); + } + } + + // Finish the declaration. + skipWhitespace (); + require ('>'); + } + + + /** + * Parse a notation declaration. + *

+     * [82] NotationDecl ::= '<!NOTATION' S Name S
+     *		(ExternalID | PublicID) S? '>'
+     * [83] PublicID ::= 'PUBLIC' S PubidLiteral
+     * 
+ *

NOTE: the '<!NOTATION' has already been read. + */ + private void parseNotationDecl () + throws Exception + { + String nname, ids[]; + + + requireWhitespace (); + nname = readNmtoken (true); + + requireWhitespace (); + + // Read the external identifiers. + ids = readExternalIds (true); + if (ids [0] == null && ids [1] == null) { + error ("external identifer missing", nname, null); + } + + // Register the notation. + setNotation (nname, ids [0], ids [1]); + + skipWhitespace (); + require ('>'); + } + + + /** + * Parse character data. + *

+     * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+     * 
+ */ + private void parseCharData () + throws Exception + { + char c; + + // Start with a little cheat -- in most + // cases, the entire sequence of + // character data will already be in + // the readBuffer; if not, fall through to + // the normal approach. + if (USE_CHEATS) { + int lineAugment = 0; + int columnAugment = 0; + +loop: + for (int i = readBufferPos; i < readBufferLength; i++) { + switch (c = readBuffer [i]) { + case '\n': + lineAugment++; + columnAugment = 0; + break; + case '&': + case '<': + int start = readBufferPos; + columnAugment++; + readBufferPos = i; + if (lineAugment > 0) { + line += lineAugment; + column = columnAugment; + } else { + column += columnAugment; + } + dataBufferAppend (readBuffer, start, i - start); + return; + case ']': + // XXX missing two end-of-buffer cases + if ((i + 2) < readBufferLength) { + if (readBuffer [i + 1] == ']' + && readBuffer [i + 2] == '>') { + error ("character data may not contain ']]>'"); + } + } + columnAugment++; + break; + default: + if (c < 0x0020 || c > 0xFFFD) + error ("illegal XML character U+" + + Integer.toHexString (c)); + // FALLTHROUGH + case '\r': + case '\t': + columnAugment++; + } + } + } + + // OK, the cheat didn't work; start over + // and do it by the book. + while (true) { + c = readCh (); + switch (c) { + case '<': + case '&': + unread (c); + return; + // XXX "]]>" precluded ... + default: + dataBufferAppend (c); + break; + } + } + } + + + ////////////////////////////////////////////////////////////////////// + // High-level reading and scanning methods. + ////////////////////////////////////////////////////////////////////// + + /** + * Require whitespace characters. + */ + private void requireWhitespace () + throws SAXException, IOException + { + char c = readCh (); + if (isWhitespace (c)) { + skipWhitespace (); + } else { + error ("whitespace required", c, null); + } + } + + + /** + * Parse whitespace characters, and leave them in the data buffer. + */ + private void parseWhitespace () + throws Exception + { + char c = readCh (); + while (isWhitespace (c)) { + dataBufferAppend (c); + c = readCh (); + } + unread (c); + } + + + /** + * Skip whitespace characters. + *
+     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
+     * 
+ */ + private void skipWhitespace () + throws SAXException, IOException + { + // Start with a little cheat. Most of + // the time, the white space will fall + // within the current read buffer; if + // not, then fall through. + if (USE_CHEATS) { + int lineAugment = 0; + int columnAugment = 0; + +loop: + for (int i = readBufferPos; i < readBufferLength; i++) { + switch (readBuffer [i]) { + case ' ': + case '\t': + case '\r': + columnAugment++; + break; + case '\n': + lineAugment++; + columnAugment = 0; + break; + case '%': + if (expandPE) + break loop; + // else fall through... + default: + readBufferPos = i; + if (lineAugment > 0) { + line += lineAugment; + column = columnAugment; + } else { + column += columnAugment; + } + return; + } + } + } + + // OK, do it by the book. + char c = readCh (); + while (isWhitespace (c)) { + c = readCh (); + } + unread (c); + } + + + /** + * Read a name or (when parsing an enumeration) name token. + *
+     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
+     * [7] Nmtoken ::= (NameChar)+
+     * 
+ */ + private String readNmtoken (boolean isName) + throws SAXException, IOException + { + char c; + + if (USE_CHEATS) { +loop: + for (int i = readBufferPos; i < readBufferLength; i++) { + c = readBuffer [i]; + switch (c) { + case '%': + if (expandPE) + break loop; + // else fall through... + + // What may legitimately come AFTER a name/nmtoken? + case '<': case '>': case '&': + case ',': case '|': case '*': case '+': case '?': + case ')': + case '=': + case '\'': case '"': + case '[': + case ' ': case '\t': case '\r': case '\n': + case ';': + case '/': + int start = readBufferPos; + if (i == start) + error ("name expected", readBuffer [i], null); + readBufferPos = i; + return intern (readBuffer, start, i - start); + + default: + // punt on exact tests from Appendix A; approximate + // them using the Unicode ID start/part rules + if (i == readBufferPos && isName) { + if (!Character.isUnicodeIdentifierStart (c) + && c != ':' && c != '_') + error ("Not a name start character, U+" + + Integer.toHexString (c)); + } else if (!Character.isUnicodeIdentifierPart (c) + && c != '-' && c != ':' && c != '_' && c != '.' + && !isExtender (c)) + error ("Not a name character, U+" + + Integer.toHexString (c)); + } + } + } + + nameBufferPos = 0; + + // Read the first character. +loop: + while (true) { + c = readCh (); + switch (c) { + case '%': + case '<': case '>': case '&': + case ',': case '|': case '*': case '+': case '?': + case ')': + case '=': + case '\'': case '"': + case '[': + case ' ': case '\t': case '\n': case '\r': + case ';': + case '/': + unread (c); + if (nameBufferPos == 0) { + error ("name expected"); + } + // punt on exact tests from Appendix A, but approximate them + if (isName + && !Character.isUnicodeIdentifierStart ( + nameBuffer [0]) + && ":_".indexOf (nameBuffer [0]) == -1) + error ("Not a name start character, U+" + + Integer.toHexString (nameBuffer [0])); + String s = intern (nameBuffer, 0, nameBufferPos); + nameBufferPos = 0; + return s; + default: + // punt on exact tests from Appendix A, but approximate them + + if ((nameBufferPos != 0 || !isName) + && !Character.isUnicodeIdentifierPart (c) + && ":-_.".indexOf (c) == -1 + && !isExtender (c)) + error ("Not a name character, U+" + + Integer.toHexString (c)); + if (nameBufferPos >= nameBuffer.length) + nameBuffer = + (char[]) extendArray (nameBuffer, + nameBuffer.length, nameBufferPos); + nameBuffer [nameBufferPos++] = c; + } + } + } + + private static boolean isExtender (char c) + { + // [88] Extender ::= ... + return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 + || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 + || (c >= 0x3031 && c <= 0x3035) + || (c >= 0x309d && c <= 0x309e) + || (c >= 0x30fc && c <= 0x30fe); + } + + + /** + * Read a literal. With matching single or double quotes as + * delimiters (and not embedded!) this is used to parse: + *
+     *	[9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
+     *	[10] AttValue ::= ... ([^<&] | Reference)* ...
+     *	[11] SystemLiteral ::= ... (URLchar - "'")* ...
+     *	[12] PubidLiteral ::= ... (PubidChar - "'")* ...
+     * 
+ * as well as the quoted strings in XML and text declarations + * (for version, encoding, and standalone) which have their + * own constraints. + */ + private String readLiteral (int flags) + throws SAXException, IOException + { + char delim, c; + int startLine = line; + boolean saved = expandPE; + + // Find the first delimiter. + delim = readCh (); + if (delim != '"' && delim != '\'' && delim != (char) 0) { + error ("expected '\"' or \"'\"", delim, null); + return null; + } + inLiteral = true; + if ((flags & LIT_DISABLE_PE) != 0) + expandPE = false; + + // Each level of input source has its own buffer; remember + // ours, so we won't read the ending delimiter from any + // other input source, regardless of entity processing. + char ourBuf [] = readBuffer; + + // Read the literal. + try { + c = readCh (); +loop: + while (! (c == delim && readBuffer == ourBuf)) { + switch (c) { + // Can't escape this normalization for attributes + case '\n': + case '\r': + case '\t': + if ((flags & LIT_ATTRIBUTE) != 0) + c = ' '; + break; + case '&': + c = readCh (); + // Char refs are expanded immediately, except for + // all the cases where it's deferred. + if (c == '#') { + if ((flags & LIT_DISABLE_CREF) != 0) { + dataBufferAppend ('&'); + dataBufferAppend ('#'); + continue; + } + parseCharRef (); + + // It looks like an entity ref ... + } else { + unread (c); + // Expand it? + if ((flags & LIT_ENTITY_REF) > 0) { + parseEntityRef (false); + + // Is it just data? + } else if ((flags & LIT_DISABLE_EREF) != 0) { + dataBufferAppend ('&'); + + // OK, it will be an entity ref -- expanded later. + } else { + String name = readNmtoken (true); + require (';'); + if ((flags & LIT_ENTITY_CHECK) != 0 + && getEntityType (name) == + ENTITY_UNDECLARED) { + error ("General entity '" + name + + "' must be declared before use"); + } + dataBufferAppend ('&'); + dataBufferAppend (name); + dataBufferAppend (';'); + } + } + c = readCh (); + continue loop; + + case '<': + // and why? Perhaps so "&foo;" expands the same + // inside and outside an attribute? + if ((flags & LIT_ATTRIBUTE) != 0) + error ("attribute values may not contain '<'"); + break; + + // We don't worry about case '%' and PE refs, readCh does. + + default: + break; + } + dataBufferAppend (c); + c = readCh (); + } + } catch (EOFException e) { + error ("end of input while looking for delimiter (started on line " + + startLine + ')', null, new Character (delim).toString ()); + } + inLiteral = false; + expandPE = saved; + + // Normalise whitespace if necessary. + if ((flags & LIT_NORMALIZE) > 0) { + dataBufferNormalize (); + } + + // Return the value. + return dataBufferToString (); + } + + + /** + * Try reading external identifiers. + * A system identifier is not required for notations. + * @param inNotation Are we in a notation? + * @return A two-member String array containing the identifiers. + */ + private String[] readExternalIds (boolean inNotation) + throws Exception + { + char c; + String ids[] = new String [2]; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + if (tryRead ("PUBLIC")) { + requireWhitespace (); + ids [0] = readLiteral (LIT_NORMALIZE | flags); + if (inNotation) { + skipWhitespace (); + c = readCh (); + unread (c); + if (c == '"' || c == '\'') { + ids [1] = readLiteral (flags); + } + } else { + requireWhitespace (); + ids [1] = readLiteral (flags); + } + + for (int i = 0; i < ids [0].length (); i++) { + c = ids [0].charAt (i); + if (c >= 'a' && c <= 'z') + continue; + if (c >= 'A' && c <= 'Z') + continue; + if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1) + continue; + error ("illegal PUBLIC id character U+" + + Integer.toHexString (c)); + } + } else if (tryRead ("SYSTEM")) { + requireWhitespace (); + ids [1] = readLiteral (flags); + } + + // XXX should normalize system IDs as follows: + // - Convert to UTF-8 + // - Map reserved and non-ASCII characters to %HH + + return ids; + } + + + /** + * Test if a character is whitespace. + *
+     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
+     * 
+ * @param c The character to test. + * @return true if the character is whitespace. + */ + private final boolean isWhitespace (char c) + { + if (c > 0x20) + return false; + if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) + return true; + return false; // illegal ... + } + + + ////////////////////////////////////////////////////////////////////// + // Utility routines. + ////////////////////////////////////////////////////////////////////// + + + /** + * Add a character to the data buffer. + */ + private void dataBufferAppend (char c) + { + // Expand buffer if necessary. + if (dataBufferPos >= dataBuffer.length) + dataBuffer = + (char[]) extendArray (dataBuffer, + dataBuffer.length, dataBufferPos); + dataBuffer [dataBufferPos++] = c; + } + + + /** + * Add a string to the data buffer. + */ + private void dataBufferAppend (String s) + { + dataBufferAppend (s.toCharArray (), 0, s.length ()); + } + + + /** + * Append (part of) a character array to the data buffer. + */ + private void dataBufferAppend (char ch[], int start, int length) + { + dataBuffer = (char[]) + extendArray (dataBuffer, dataBuffer.length, + dataBufferPos + length); + + System.arraycopy (ch, start, dataBuffer, dataBufferPos, length); + dataBufferPos += length; + } + + + /** + * Normalise whitespace in the data buffer. + */ + private void dataBufferNormalize () + { + int i = 0; + int j = 0; + int end = dataBufferPos; + + // Skip whitespace at the start. + while (j < end && isWhitespace (dataBuffer [j])) { + j++; + } + + // Skip whitespace at the end. + while (end > j && isWhitespace (dataBuffer [end - 1])) { + end --; + } + + // Start copying to the left. + while (j < end) { + + char c = dataBuffer [j++]; + + // Normalise all other whitespace to + // a single space. + if (isWhitespace (c)) { + while (j < end && isWhitespace (dataBuffer [j++])) {} + + dataBuffer [i++] = ' '; + dataBuffer [i++] = dataBuffer [j - 1]; + } else { + dataBuffer [i++] = c; + } + } + + // The new length is <= the old one. + dataBufferPos = i; + } + + + /** + * Convert the data buffer to a string. + */ + private String dataBufferToString () + { + String s = new String (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + return s; + } + + + /** + * Flush the contents of the data buffer to the handler, as + * appropriate, and reset the buffer for new input. + */ + private void dataBufferFlush () + throws SAXException + { + if (currentElementContent == CONTENT_ELEMENTS + && dataBufferPos > 0 + && !inCDATA + ) { + // We can't just trust the buffer to be whitespace, there + // are cases when it isn't + for (int i = 0; i < dataBufferPos; i++) { + if (!isWhitespace (dataBuffer [i])) { + handler.charData (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + if (dataBufferPos > 0) { + handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } else if (dataBufferPos > 0) { + handler.charData (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + + + /** + * Require a string to appear, or throw an exception. + *

Precondition: Entity expansion is not required. + *

Precondition: data buffer has no characters that + * will get sent to the application. + */ + private void require (String delim) + throws SAXException, IOException + { + int length = delim.length (); + char ch []; + + if (length < dataBuffer.length) { + ch = dataBuffer; + delim.getChars (0, length, ch, 0); + } else + ch = delim.toCharArray (); + + if (USE_CHEATS + && length <= (readBufferLength - readBufferPos)) { + int offset = readBufferPos; + + for (int i = 0; i < length; i++, offset++) + if (ch [i] != readBuffer [offset]) + error ("required string", null, delim); + readBufferPos = offset; + + } else { + for (int i = 0; i < length; i++) + require (ch [i]); + } + } + + + /** + * Require a character to appear, or throw an exception. + */ + private void require (char delim) + throws SAXException, IOException + { + char c = readCh (); + + if (c != delim) { + error ("required character", c, new Character (delim).toString ()); + } + } + + + /** + * Create an interned string from a character array. + * Ælfred uses this method to create an interned version + * of all names and name tokens, so that it can test equality + * with == instead of String.equals (). + * + *

This is much more efficient than constructing a non-interned + * string first, and then interning it. + * + * @param ch an array of characters for building the string. + * @param start the starting position in the array. + * @param length the number of characters to place in the string. + * @return an interned string. + * @see #intern (String) + * @see java.lang.String#intern + */ + public String intern (char ch[], int start, int length) + { + int index = 0; + int hash = 0; + Object bucket []; + + // Generate a hash code. + for (int i = start; i < start + length; i++) + hash = 31 * hash + ch [i]; + hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; + + // Get the bucket -- consists of {array,String} pairs + if ((bucket = symbolTable [hash]) == null) { + // first string in this bucket + bucket = new Object [8]; + + // Search for a matching tuple, and + // return the string if we find one. + } else { + while (index < bucket.length) { + char chFound [] = (char []) bucket [index]; + + // Stop when we hit a null index. + if (chFound == null) + break; + + // If they're the same length, check for a match. + if (chFound.length == length) { + for (int i = 0; i < chFound.length; i++) { + // continue search on failure + if (ch [start + i] != chFound [i]) { + break; + } else if (i == length - 1) { + // That's it, we have a match! + return (String) bucket [index + 1]; + } + } + } + index += 2; + } + // Not found -- we'll have to add it. + + // Do we have to grow the bucket? + bucket = (Object []) extendArray (bucket, bucket.length, index); + } + symbolTable [hash] = bucket; + + // OK, add it to the end of the bucket -- "local" interning. + // Intern "globally" to let applications share interning benefits. + String s = new String (ch, start, length).intern (); + bucket [index] = s.toCharArray (); + bucket [index + 1] = s; + return s; + } + + + /** + * Ensure the capacity of an array, allocating a new one if + * necessary. Usually called only a handful of times. + */ + private Object extendArray (Object array, int currentSize, int requiredSize) + { + if (requiredSize < currentSize) { + return array; + } else { + Object newArray = null; + int newSize = currentSize * 2; + + if (newSize <= requiredSize) + newSize = requiredSize + 1; + + if (array instanceof char[]) + newArray = new char [newSize]; + else if (array instanceof Object[]) + newArray = new Object [newSize]; + else + throw new RuntimeException (); + + System.arraycopy (array, 0, newArray, 0, currentSize); + return newArray; + } + } + + + ////////////////////////////////////////////////////////////////////// + // XML query routines. + ////////////////////////////////////////////////////////////////////// + + + // + // Elements + // + + /** + * Get the declared elements for an XML document. + *

The results will be valid only after the DTD (if any) has been + * parsed. + * @return An enumeration of all element types declared for this + * document (as Strings). + * @see #getElementContentType + * @see #getElementContentModel + */ + public Enumeration declaredElements () + { + return elementInfo.keys (); + } + + + private int getContentType (Object element [], int defaultType) + { + int retval; + + if (element == null) + return defaultType; + retval = ((Integer) element [0]).intValue (); + if (retval == CONTENT_UNDECLARED) + retval = defaultType; + return retval; + } + + + /** + * Look up the content type of an element. + * @param name The element type name. + * @return An integer constant representing the content type. + * @see #getElementContentModel + * @see #CONTENT_UNDECLARED + * @see #CONTENT_ANY + * @see #CONTENT_EMPTY + * @see #CONTENT_MIXED + * @see #CONTENT_ELEMENTS + */ + public int getElementContentType (String name) + { + Object element [] = (Object []) elementInfo.get (name); + return getContentType (element, CONTENT_UNDECLARED); + } + + + /** + * Look up the content model of an element. + *

The result will always be null unless the content type is + * CONTENT_ELEMENTS or CONTENT_MIXED. + * @param name The element type name. + * @return The normalised content model, as a string. + * @see #getElementContentType + */ + public String getElementContentModel (String name) + { + Object element[] = (Object[]) elementInfo.get (name); + if (element == null) { + return null; + } else { + return (String) element [1]; + } + } + + + /** + * Register an element. + * Array format: + * [0] element type name + * [1] content model (mixed, elements only) + * [2] attribute hash table + */ + private Object [] + setElement (String name, int contentType, String contentModel) + { + Object element [] = (Object []) elementInfo.get (name); + + // first or for this type + if (element == null) { + element = new Object [3]; + element [0] = new Integer (contentType); + element [1] = contentModel; + element [2] = new Hashtable (DEFAULT_ATTR_COUNT); + elementInfo.put (name, element); + return element; + } + + // multiple declarations + if (((Integer) element [0]).intValue () != CONTENT_UNDECLARED) { + // warn ("multiple declarations for element type", name, null); + return element; + } + + // after associated + element [0] = new Integer (contentType); + element [1] = contentModel; + + return element; + } + + + /** + * Look up the attribute hash table for an element. + * The hash table is the second item in the element array. + */ + private Hashtable getElementAttributes (String name) + { + Object element[] = (Object[]) elementInfo.get (name); + + if (element == null) + element = setElement (name, CONTENT_UNDECLARED, null); + return (Hashtable) element [2]; + } + + + + // + // Attributes + // + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An Enumeration of all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + private Enumeration declaredAttributes (Object element []) + { + Hashtable attlist; + + if (element == null) + return null; + if ((attlist = (Hashtable) element [2]) == null) + return null; + return attlist.keys (); + } + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An Enumeration of all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + public Enumeration declaredAttributes (String elname) + { + return declaredAttributes ((Object []) elementInfo.get (elname)); + } + + + /** + * Retrieve the declared type of an attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return An integer constant representing the attribute type. + * @see #ATTRIBUTE_UNDECLARED + * @see #ATTRIBUTE_CDATA + * @see #ATTRIBUTE_ID + * @see #ATTRIBUTE_IDREF + * @see #ATTRIBUTE_IDREFS + * @see #ATTRIBUTE_ENTITY + * @see #ATTRIBUTE_ENTITIES + * @see #ATTRIBUTE_NMTOKEN + * @see #ATTRIBUTE_NMTOKENS + * @see #ATTRIBUTE_ENUMERATED + * @see #ATTRIBUTE_NOTATION + */ + public int getAttributeType (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return ATTRIBUTE_UNDECLARED; + } else { + return ((Integer) attribute [0]).intValue (); + } + } + + + /** + * Retrieve the allowed values for an enumerated attribute type. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return A string containing the token list. + * @see #ATTRIBUTE_ENUMERATED + * @see #ATTRIBUTE_NOTATION + */ + public String getAttributeEnumeration (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return null; + } else { + return (String) attribute [3]; + } + } + + + /** + * Retrieve the default value of a declared attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return The default value, or null if the attribute was + * #IMPLIED or simply undeclared and unspecified. + * @see #getAttributeExpandedValue + */ + public String getAttributeDefaultValue (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return null; + } else { + return (String) attribute [1]; + } + } + + + /** + * Retrieve the expanded value of a declared attribute. + *

General entities will be expanded (once). + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return The expanded default value, or null if the attribute was + * #IMPLIED or simply undeclared + * @see #getAttributeDefaultValue + */ + public String getAttributeExpandedValue (String name, String aname) + throws Exception + { + Object attribute[] = getAttribute (name, aname); + + if (attribute == null) { + return null; + } else if (attribute [4] == null && attribute [1] != null) { + // we MUST use the same buf for both quotes else the literal + // can't be properly terminated + char buf [] = new char [1]; + int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; + int type = getAttributeType (name, aname); + + if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED) + flags |= LIT_NORMALIZE; + buf [0] = '"'; + pushCharArray (null, buf, 0, 1); + pushString (null, (String) attribute [1]); + pushCharArray (null, buf, 0, 1); + attribute [4] = readLiteral (flags); + } + return (String) attribute [4]; + } + + + /** + * Retrieve the default value type of a declared attribute. + * @see #ATTRIBUTE_DEFAULT_SPECIFIED + * @see #ATTRIBUTE_DEFAULT_IMPLIED + * @see #ATTRIBUTE_DEFAULT_REQUIRED + * @see #ATTRIBUTE_DEFAULT_FIXED + */ + public int getAttributeDefaultValueType (String name, String aname) + { + Object attribute[] = getAttribute (name, aname); + if (attribute == null) { + return ATTRIBUTE_DEFAULT_UNDECLARED; + } else { + return ((Integer) attribute [2]).intValue (); + } + } + + + /** + * Register an attribute declaration for later retrieval. + * Format: + * - String type + * - String default value + * - int value type + */ + private void setAttribute (String elName, String name, int type, + String enumeration, + String value, int valueType) + throws Exception + { + Hashtable attlist; + Object attribute[]; + + // Create a new hashtable if necessary. + attlist = getElementAttributes (elName); + + // ignore multiple attribute declarations! + if (attlist.get (name) != null) { + // warn ... + return; + } else { + attribute = new Object [5]; + attribute [0] = new Integer (type); + attribute [1] = value; + attribute [2] = new Integer (valueType); + attribute [3] = enumeration; + attribute [4] = null; + attlist.put (name, attribute); + } + } + + + /** + * Retrieve the three-member array representing an + * attribute declaration. + */ + private Object[] getAttribute (String elName, String name) + { + Hashtable attlist; + Object attribute[]; + + attlist = getElementAttributes (elName); + if (attlist == null) { + return null; + } + + attribute = (Object[]) attlist.get (name); + return attribute; + } + + + // + // Entities + // + + /** + * Get declared entities. + * @return An Enumeration of all the entities declared for + * this XML document. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getEntityType + * @see #getEntityPublicId + * @see #getEntitySystemId + * @see #getEntityValue + * @see #getEntityNotationName + */ + public Enumeration declaredEntities () + { + return entityInfo.keys (); + } + + + /** + * Find the type of an entity. + * @returns An integer constant representing the entity type. + * @see #ENTITY_UNDECLARED + * @see #ENTITY_INTERNAL + * @see #ENTITY_NDATA + * @see #ENTITY_TEXT + */ + public int getEntityType (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return ENTITY_UNDECLARED; + } else { + return ((Integer) entity [0]).intValue (); + } + } + + + /** + * Return an external entity's public identifier, if any. + * @param ename The name of the external entity. + * @return The entity's system identifier, or null if the + * entity was not declared, if it is not an + * external entity, or if no public identifier was + * provided. + * @see #getEntityType + */ + public String getEntityPublicId (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return null; + } else { + return (String) entity [1]; + } + } + + + /** + * Return an external entity's system identifier. + * @param ename The name of the external entity. + * @return The entity's system identifier, or null if the + * entity was not declared, or if it is not an + * external entity. + * @see #getEntityType + */ + public String getEntitySystemId (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return null; + } else { + return (String) entity [2]; + } + } + + + /** + * Return the value of an internal entity. + * @param ename The name of the internal entity. + * @return The entity's value, or null if the entity was + * not declared, or if it is not an internal entity. + * @see #getEntityType + */ + public String getEntityValue (String ename) + { + Object entity[] = (Object[]) entityInfo.get (ename); + if (entity == null) { + return null; + } else { + return (String) entity [3]; + } + } + + + /** + * Get the notation name associated with an NDATA entity. + * @param ename The NDATA entity name. + * @return The associated notation name, or null if the + * entity was not declared, or if it is not an + * NDATA entity. + * @see #getEntityType + */ + public String getEntityNotationName (String eName) + { + Object entity[] = (Object[]) entityInfo.get (eName); + if (entity == null) { + return null; + } else { + return (String) entity [4]; + } + } + + + /** + * Register an entity declaration for later retrieval. + */ + private void setInternalEntity (String eName, String value) + { + setEntity (eName, ENTITY_INTERNAL, null, null, value, null); + } + + + /** + * Register an external data entity. + */ + private void setExternalDataEntity (String eName, String pubid, + String sysid, String nName) + { + setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName); + } + + + /** + * Register an external text entity. + */ + private void setExternalTextEntity (String eName, + String pubid, String sysid) + { + setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null); + } + + + /** + * Register an entity declaration for later retrieval. + */ + private void setEntity (String eName, int eClass, + String pubid, String sysid, + String value, String nName) + { + Object entity[]; + + if (entityInfo.get (eName) == null) { + entity = new Object [5]; + entity [0] = new Integer (eClass); + entity [1] = pubid; + entity [2] = sysid; + entity [3] = value; + entity [4] = nName; + + entityInfo.put (eName, entity); + } + } + + + // + // Notations. + // + + /** + * Get declared notations. + * @return An Enumeration of all the notations declared for + * this XML document. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getNotationPublicId + * @see #getNotationSystemId + */ + public Enumeration declaredNotations () + { + return notationInfo.keys (); + } + + + /** + * Look up the public identifier for a notation. + * You will normally use this method to look up a notation + * that was provided as an attribute value or for an NDATA entity. + * @param nname The name of the notation. + * @return A string containing the public identifier, or null + * if none was provided or if no such notation was + * declared. + * @see #getNotationSystemId + */ + public String getNotationPublicId (String nname) + { + Object notation[] = (Object[]) notationInfo.get (nname); + if (notation == null) { + return null; + } else { + return (String) notation [0]; + } + } + + + /** + * Look up the system identifier for a notation. + * You will normally use this method to look up a notation + * that was provided as an attribute value or for an NDATA entity. + * @param nname The name of the notation. + * @return A string containing the system identifier, or null + * if no such notation was declared. + * @see #getNotationPublicId + */ + public String getNotationSystemId (String nname) + { + Object notation[] = (Object[]) notationInfo.get (nname); + if (notation == null) { + return null; + } else { + return (String) notation [1]; + } + } + + + /** + * Register a notation declaration for later retrieval. + * Format: + * - public id + * - system id + */ + private void setNotation (String nname, String pubid, String sysid) + throws Exception + { + Object notation[]; + + if (notationInfo.get (nname) == null) { + notation = new Object [2]; + notation [0] = pubid; + notation [1] = sysid; + notationInfo.put (nname, notation); + } else { + // VC: Unique Notation Name + // (it's not fatal) + } + } + + + // + // Location. + // + + + /** + * Return the current line number. + */ + public int getLineNumber () + { + return line; + } + + + /** + * Return the current column number. + */ + public int getColumnNumber () + { + return column; + } + + + ////////////////////////////////////////////////////////////////////// + // High-level I/O. + ////////////////////////////////////////////////////////////////////// + + + /** + * Read a single character from the readBuffer. + *

The readDataChunk () method maintains the buffer. + *

If we hit the end of an entity, try to pop the stack and + * keep going. + *

(This approach doesn't really enforce XML's rules about + * entity boundaries, but this is not currently a validating + * parser). + *

This routine also attempts to keep track of the current + * position in external entities, but it's not entirely accurate. + * @return The next available input character. + * @see #unread (char) + * @see #unread (String) + * @see #readDataChunk + * @see #readBuffer + * @see #line + * @return The next character from the current input source. + */ + private char readCh () + throws SAXException, IOException + { + char c; + + // As long as there's nothing in the + // read buffer, try reading more data + // (for an external entity) or popping + // the entity stack (for either). + while (readBufferPos >= readBufferLength) { + switch (sourceType) { + case INPUT_READER: + case INPUT_EXTERNAL: + case INPUT_STREAM: + readDataChunk (); + while (readBufferLength < 1) { + popInput (); + if (readBufferLength < 1) { + readDataChunk (); + } + } + break; + + default: + + popInput (); + break; + } + } + + c = readBuffer [readBufferPos++]; + + if (c == '\n') { + line++; + column = 0; + } else { + if (c == '<') + /* favorite return to parseContent () .. NOP */ ; + else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) + error ("illegal XML character U+" + + Integer.toHexString (c)); + + // If we're in the DTD and in a context where PEs get expanded, + // do so ... 1/14/2000 errata identify those contexts. There + // are also spots in the internal subset where PE refs are fatal + // errors, hence yet another flag. + else if (c == '%' && expandPE) { + if (peIsError) + error ("PE reference within decl in internal subset."); + parsePEReference (); + return readCh (); + } + column++; + } + + return c; + } + + + /** + * Push a single character back onto the current input stream. + *

This method usually pushes the character back onto + * the readBuffer, while the unread (String) method treats the + * string as a new internal entity. + *

I don't think that this would ever be called with + * readBufferPos = 0, because the methods always reads a character + * before unreading it, but just in case, I've added a boundary + * condition. + * @param c The character to push back. + * @see #readCh + * @see #unread (String) + * @see #unread (char[]) + * @see #readBuffer + */ + private void unread (char c) + throws SAXException + { + // Normal condition. + if (c == '\n') { + line--; + column = -1; + } + if (readBufferPos > 0) { + readBuffer [--readBufferPos] = c; + } else { + pushString (null, new Character (c).toString ()); + } + } + + + /** + * Push a char array back onto the current input stream. + *

NOTE: you must never push back characters that you + * haven't actually read: use pushString () instead. + * @see #readCh + * @see #unread (char) + * @see #unread (String) + * @see #readBuffer + * @see #pushString + */ + private void unread (char ch[], int length) + throws SAXException + { + for (int i = 0; i < length; i++) { + if (ch [i] == '\n') { + line--; + column = -1; + } + } + if (length < readBufferPos) { + readBufferPos -= length; + } else { + pushCharArray (null, ch, 0, length); + sourceType = INPUT_BUFFER; + } + } + + + /** + * Push a new external input source. + * The source will be some kind of parsed entity, such as a PE + * (including the external DTD subset) or content for the body. + *

TODO: Right now, this method always attempts to autodetect + * the encoding; in the future, it should allow the caller to + * request an encoding explicitly, and it should also look at the + * headers with an HTTP connection. + * @param url The java.net.URL object for the entity. + * @see StructSaxDriver#resolveEntity + * @see #pushString + * @see #sourceType + * @see #pushInput + * @see #detectEncoding + * @see #sourceType + * @see #readBuffer + */ + private void pushURL ( + String ename, + String publicId, + String systemId, + Reader reader, + InputStream stream, + String encoding + ) throws SAXException, IOException + { + URL url; + boolean ignoreEncoding = false; + + // Push the existing status. + pushInput (ename); + + // Create a new read buffer. + // (Note the four-character margin) + readBuffer = new char [READ_BUFFER_MAX + 4]; + readBufferPos = 0; + readBufferLength = 0; + readBufferOverflow = -1; + is = null; + line = 1; + + currentByteCount = 0; + + // Make any system ID (URI/URL) absolute. There's one case + // where it may be null: parser was invoked without providing + // one, e.g. since the XML data came from a memory buffer. + + if (systemId != null && externalEntity != null) { + systemId = new URL (externalEntity.getURL (), systemId).toString (); + } else if (baseURI != null) { + systemId = new URL (new URL (baseURI), systemId).toString (); + // throws IOException if couldn't create new URL + } + + // See if the application wants to + // redirect the system ID and/or + // supply its own character stream. + if (reader == null && stream == null && systemId != null) { + Object input = handler.resolveEntity (publicId, systemId); + if (input != null) { + if (input instanceof String) { + systemId = (String) input; + } else if (input instanceof InputStream) { + stream = (InputStream) input; + } else if (input instanceof Reader) { + reader = (Reader) input; + } + } + } + + // Start the entity. + if (systemId != null) { + handler.startExternalEntity (systemId); + } else { + handler.startExternalEntity ("[unidentified data stream]"); + } + + // If there's an explicit character stream, just + // ignore encoding declarations. + if (reader != null) { + sourceType = INPUT_READER; + this.reader = reader; + tryEncodingDecl (true); + return; + } + + // Else we handle the conversion, and need to ensure + // it's done right. + if (stream != null) { + sourceType = INPUT_STREAM; + is = stream; + url = null; + } else { + // We have to open our own stream to the URL. + + // Set the new status + sourceType = INPUT_EXTERNAL; + url = new URL (systemId); + + externalEntity = url.openConnection (); + externalEntity.connect (); + is = externalEntity.getInputStream (); + } + + // If we get to here, there must be + // an InputStream available. + if (!is.markSupported ()) { + is = new BufferedInputStream (is); + } + + // Get any external encoding label. + if (encoding == null && externalEntity != null) { + // External labels can be untrustworthy; filesystems in + // particular often have the wrong default for content + // that wasn't locally originated. Those we autodetect. + if (!"file".equals (externalEntity.getURL ().getProtocol ())) { + int temp; + + // application/xml;charset=something;otherAttr=... + // ... with many variants on 'something' + encoding = externalEntity.getContentType (); + temp = encoding.indexOf ("charset"); + + // RFC 2376 sez MIME text defaults to ASCII, but since the + // JDK will create a MIME type out of thin air, we always + // autodetect when there's no explicit charset attribute. + if (temp < 0) + encoding = null; // autodetect + else { + temp = encoding.indexOf ('=', temp + 7); + encoding = encoding.substring (temp); + if ((temp = encoding.indexOf (';')) > 0) + encoding = encoding.substring (0, temp); + + // attributes can have comment fields (RFC 822) + if ((temp = encoding.indexOf ('(')) > 0) + encoding = encoding.substring (0, temp); + // ... and values may be quoted + if ((temp = encoding.indexOf ('"')) > 0) + encoding = encoding.substring (temp + 1, + encoding.indexOf ('"', temp + 2)); + encoding.trim (); + } + } + } + + // if we got an external encoding label, use it ... + if (encoding != null) { + this.encoding = ENCODING_EXTERNAL; + setupDecoding (encoding); + ignoreEncoding = true; + + // ... else autodetect + } else { + detectEncoding (); + ignoreEncoding = false; + } + + // Read any XML or text declaration. + tryEncodingDecl (ignoreEncoding); + } + + + /** + * Check for an encoding declaration. This is the second part of the + * XML encoding autodetection algorithm, relying on detectEncoding to + * get to the point that this part can read any encoding declaration + * in the document (using only US-ASCII characters). + * + *

Because this part starts to fill parser buffers with this data, + * it's tricky to to a reader so that Java's built-in decoders can be + * used for the character encodings that aren't built in to this parser + * (such as EUC-JP, KOI8-R, Big5, etc). + * + * @return any encoding in the declaration, uppercased; or null + * @see detectEncoding + */ + private String tryEncodingDecl (boolean ignoreEncoding) + throws SAXException, IOException + { + // Read the XML/text declaration. + if (tryRead (" 0) { + return parseTextDecl (ignoreEncoding); + } else { + return parseXMLDecl (ignoreEncoding); + } + } else { + unread ("xml".toCharArray (), 3); + parsePI (); + } + } + return null; + } + + + /** + * Attempt to detect the encoding of an entity. + *

The trick here (as suggested in the XML standard) is that + * any entity not in UTF-8, or in UCS-2 with a byte-order mark, + * must begin with an XML declaration or an encoding + * declaration; we simply have to look for "<?xml" in various + * encodings. + *

This method has no way to distinguish among 8-bit encodings. + * Instead, it sets up for UTF-8, then (possibly) revises its assumption + * later in setupDecoding (). Any ASCII-derived 8-bit encoding + * should work, but most will be rejected later by setupDecoding (). + *

I don't currently detect EBCDIC, since I'm concerned that it + * could also be a valid UTF-8 sequence; I'll have to do more checking + * later. + * @see #tryEncoding (byte[], byte, byte, byte, byte) + * @see #tryEncoding (byte[], byte, byte) + * @see #setupDecoding + * @see #read8bitEncodingDeclaration + */ + private void detectEncoding () + throws SAXException, IOException + { + byte signature[] = new byte [4]; + + // Read the first four bytes for + // autodetection. + is.mark (4); + is.read (signature); + is.reset (); + + // + // FIRST: four byte encodings (who uses these?) + // + if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x3c)) { + // UCS-4 must begin with "Utility routine for detectEncoding (). + *

Always looks for some part of "Looks for a UCS-2 byte-order mark. + *

Utility routine for detectEncoding (). + * @param sig The first four bytes read. + * @param b1 The first byte of the signature + * @param b2 The second byte of the signature + * @see #detectEncoding + */ + private static boolean tryEncoding (byte sig[], byte b1, byte b2) + { + return ((sig [0] == b1) && (sig [1] == b2)); + } + + + /** + * This method pushes a string back onto input. + *

It is useful either as the expansion of an internal entity, + * or for backtracking during the parse. + *

Call pushCharArray () to do the actual work. + * @param s The string to push back onto input. + * @see #pushCharArray + */ + private void pushString (String ename, String s) + throws SAXException + { + char ch[] = s.toCharArray (); + pushCharArray (ename, ch, 0, ch.length); + } + + + /** + * Push a new internal input source. + *

This method is useful for expanding an internal entity, + * or for unreading a string of characters. It creates a new + * readBuffer containing the characters in the array, instead + * of characters converted from an input byte stream. + * @param ch The char array to push. + * @see #pushString + * @see #pushURL + * @see #readBuffer + * @see #sourceType + * @see #pushInput + */ + private void pushCharArray (String ename, char ch[], int start, int length) + throws SAXException + { + // Push the existing status + pushInput (ename); + sourceType = INPUT_INTERNAL; + readBuffer = ch; + readBufferPos = start; + readBufferLength = length; + readBufferOverflow = -1; + } + + + /** + * Save the current input source onto the stack. + *

This method saves all of the global variables associated with + * the current input source, so that they can be restored when a new + * input source has finished. It also tests for entity recursion. + *

The method saves the following global variables onto a stack + * using a fixed-length array: + *

    + *
  1. sourceType + *
  2. externalEntity + *
  3. readBuffer + *
  4. readBufferPos + *
  5. readBufferLength + *
  6. line + *
  7. encoding + *
+ * @param ename The name of the entity (if any) causing the new input. + * @see #popInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void pushInput (String ename) + throws SAXException + { + Object input[] = new Object [12]; + + // Check for entity recursion. + if (ename != null) { + Enumeration entities = entityStack.elements (); + while (entities.hasMoreElements ()) { + String e = (String) entities.nextElement (); + if (e == ename) { + error ("recursive reference to entity", ename, null); + } + } + } + entityStack.push (ename); + + // Don't bother if there is no current input. + if (sourceType == INPUT_NONE) { + return; + } + + // Set up a snapshot of the current + // input source. + input [0] = new Integer (sourceType); + input [1] = externalEntity; + input [2] = readBuffer; + input [3] = new Integer (readBufferPos); + input [4] = new Integer (readBufferLength); + input [5] = new Integer (line); + input [6] = new Integer (encoding); + input [7] = new Integer (readBufferOverflow); + input [8] = is; + input [9] = new Integer (currentByteCount); + input [10] = new Integer (column); + input [11] = reader; + + // Push it onto the stack. + inputStack.push (input); + } + + + /** + * Restore a previous input source. + *

This method restores all of the global variables associated with + * the current input source. + * @exception java.io.EOFException + * If there are no more entries on the input stack. + * @see #pushInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void popInput () + throws SAXException, IOException + { + Object input[]; + + + switch (sourceType) { + + case INPUT_EXTERNAL: + if (externalEntity != null) { + handler.endExternalEntity ( + externalEntity.getURL ().toString ()); + } + break; + case INPUT_STREAM: + if (baseURI != null) { + handler.endExternalEntity (baseURI); + } + is.close (); + break; + case INPUT_READER: + if (baseURI != null) { + handler.endExternalEntity (baseURI); + } + reader.close (); + break; + } + + // Throw an EOFException if there + // is nothing else to pop. + if (inputStack.isEmpty ()) { + throw new EOFException ("no more input"); + } else { + String s; + input = (Object[]) inputStack.pop (); + s = (String) entityStack.pop (); + } + + sourceType = ((Integer) input [0]).intValue (); + externalEntity = (URLConnection) input [1]; + readBuffer = (char[]) input [2]; + readBufferPos = ((Integer) input [3]).intValue (); + readBufferLength = ((Integer) input [4]).intValue (); + line = ((Integer) input [5]).intValue (); + encoding = ((Integer) input [6]).intValue (); + readBufferOverflow = ((Integer) input [7]).intValue (); + is = (InputStream) input [8]; + currentByteCount = ((Integer) input [9]).intValue (); + column = ((Integer) input [10]).intValue (); + reader = (Reader) input [11]; + } + + + /** + * Return true if we can read the expected character. + *

Note that the character will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the character again if the method succeeds. + * @param delim The character that should appear next. For a + * insensitive match, you must supply this in upper-case. + * @return true if the character was successfully read, or false if + * it was not. + * @see #tryRead (String) + */ + private boolean tryRead (char delim) + throws SAXException, IOException + { + char c; + + // Read the character + c = readCh (); + + // Test for a match, and push the character + // back if the match fails. + if (c == delim) { + return true; + } else { + unread (c); + return false; + } + } + + + /** + * Return true if we can read the expected string. + *

This is simply a convenience method. + *

Note that the string will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the string again if the method succeeds. + *

This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + *

NOTE: This method currently has a hard-coded limit + * of 100 characters for the delimiter. + * @param delim The string that should appear next. + * @return true if the string was successfully read, or false if + * it was not. + * @see #tryRead (char) + */ + private boolean tryRead (String delim) + throws SAXException, IOException + { + char ch[] = delim.toCharArray (); + char c; + + // Compare the input, character- + // by character. + + for (int i = 0; i < ch.length; i++) { + c = readCh (); + if (c != ch [i]) { + unread (c); + if (i != 0) { + unread (ch, i); + } + return false; + } + } + return true; + } + + + + /** + * Return true if we can read some whitespace. + *

This is simply a convenience method. + *

This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + * @return true if whitespace was found. + */ + private boolean tryWhitespace () + throws SAXException, IOException + { + char c; + c = readCh (); + if (isWhitespace (c)) { + skipWhitespace (); + return true; + } else { + unread (c); + return false; + } + } + + + /** + * Read all data until we find the specified string. + * This is useful for scanning CDATA sections and PIs. + *

This is inefficient right now, since it calls tryRead () + * for every character. + * @param delim The string delimiter + * @see #tryRead (String, boolean) + * @see #readCh + */ + private void parseUntil (String delim) + throws SAXException, IOException + { + char c; + int startLine = line; + + try { + while (!tryRead (delim)) { + c = readCh (); + dataBufferAppend (c); + } + } catch (EOFException e) { + error ("end of input while looking for delimiter " + + "(started on line " + startLine + + ')', null, delim); + } + } + + + /** + * Read just the encoding declaration (or XML declaration) at the + * start of an external entity. + * When this method is called, we know that the declaration is + * present (or appears to be). We also know that the entity is + * in some sort of ASCII-derived 8-bit encoding. + * The idea of this is to let us read what the 8-bit encoding is + * before we've committed to converting any more of the file; the + * XML or encoding declaration must be in 7-bit ASCII, so we're + * safe as long as we don't go past it. + */ + private void read8bitEncodingDeclaration () + throws SAXException, IOException + { + int ch; + readBufferPos = readBufferLength = 0; + + while (true) { + ch = is.read (); + readBuffer [readBufferLength++] = (char) ch; + switch (ch) { + case (int) '>': + return; + case - 1: + error ("end of file before end of XML or encoding declaration.", + null, "?>"); + } + if (readBuffer.length == readBufferLength) + error ("unfinished XML or encoding declaration"); + } + } + + + ////////////////////////////////////////////////////////////////////// + // Low-level I/O. + ////////////////////////////////////////////////////////////////////// + + + /** + * Read a chunk of data from an external input source. + *

This is simply a front-end that fills the rawReadBuffer + * with bytes, then calls the appropriate encoding handler. + * @see #encoding + * @see #rawReadBuffer + * @see #readBuffer + * @see #filterCR + * @see #copyUtf8ReadBuffer + * @see #copyIso8859_1ReadBuffer + * @see #copyUcs_2ReadBuffer + * @see #copyUcs_4ReadBuffer + */ + private void readDataChunk () + throws SAXException, IOException + { + int count, i, j; + + // See if we have any overflow (filterCR sets for CR at end) + if (readBufferOverflow > -1) { + readBuffer [0] = (char) readBufferOverflow; + readBufferOverflow = -1; + readBufferPos = 1; + sawCR = true; + } else { + readBufferPos = 0; + sawCR = false; + } + + // input from a character stream. + if (sourceType == INPUT_READER) { + count = reader.read (readBuffer, + readBufferPos, READ_BUFFER_MAX - readBufferPos); + if (count < 0) + readBufferLength = readBufferPos; + else + readBufferLength = readBufferPos + count; + if (readBufferLength > 0) + filterCR (count >= 0); + sawCR = false; + return; + } + + // Read as many bytes as possible into the raw buffer. + count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX); + + // Dispatch to an encoding-specific reader method to populate + // the readBuffer. In most parser speed profiles, these routines + // show up at the top of the CPU usage chart. + if (count > 0) { + switch (encoding) { + // one byte builtins + case ENCODING_ASCII: + copyIso8859_1ReadBuffer (count, (char) 0x0080); + break; + case ENCODING_UTF_8: + copyUtf8ReadBuffer (count); + break; + case ENCODING_ISO_8859_1: + copyIso8859_1ReadBuffer (count, (char) 0); + break; + + // two byte builtins + case ENCODING_UCS_2_12: + copyUcs2ReadBuffer (count, 8, 0); + break; + case ENCODING_UCS_2_21: + copyUcs2ReadBuffer (count, 0, 8); + break; + + // four byte builtins + case ENCODING_UCS_4_1234: + copyUcs4ReadBuffer (count, 24, 16, 8, 0); + break; + case ENCODING_UCS_4_4321: + copyUcs4ReadBuffer (count, 0, 8, 16, 24); + break; + case ENCODING_UCS_4_2143: + copyUcs4ReadBuffer (count, 16, 24, 0, 8); + break; + case ENCODING_UCS_4_3412: + copyUcs4ReadBuffer (count, 8, 0, 24, 16); + break; + } + } else + readBufferLength = readBufferPos; + + readBufferPos = 0; + + // Filter out all carriage returns if we've seen any + // (including any saved from a previous read) + if (sawCR) { + filterCR (count >= 0); + sawCR = false; + + // must actively report EOF, lest some CRs get lost. + if (readBufferLength == 0 && count >= 0) + readDataChunk (); + } + + if (count > 0) + currentByteCount += count; + } + + + /** + * Filter carriage returns in the read buffer. + * CRLF becomes LF; CR becomes LF. + * @param moreData true iff more data might come from the same source + * @see #readDataChunk + * @see #readBuffer + * @see #readBufferOverflow + */ + private void filterCR (boolean moreData) + { + int i, j; + + readBufferOverflow = -1; + +loop: + for (i = j = readBufferPos; j < readBufferLength; i++, j++) { + switch (readBuffer [j]) { + case '\r': + if (j == readBufferLength - 1) { + if (moreData) { + readBufferOverflow = '\r'; + readBufferLength--; + } else // CR at end of buffer + readBuffer [i++] = '\n'; + break loop; + } else if (readBuffer [j + 1] == '\n') { + j++; + } + readBuffer [i] = '\n'; + break; + + case '\n': + default: + readBuffer [i] = readBuffer [j]; + break; + } + } + readBufferLength = i; + } + + /** + * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * @param count The number of bytes to convert. + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + * @see #getNextUtf8Byte + */ + private void copyUtf8ReadBuffer (int count) + throws SAXException, IOException + { + int i = 0; + int j = readBufferPos; + int b1; + char c = 0; + + /* + // check once, so the runtime won't (if it's smart enough) + if (count < 0 || count > rawReadBuffer.length) + throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); + */ + + while (i < count) { + b1 = rawReadBuffer [i++]; + + // Determine whether we are dealing + // with a one-, two-, three-, or four- + // byte sequence. + if (b1 < 0) { + if ((b1 & 0xe0) == 0xc0) { + // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx + c = (char) (((b1 & 0x1f) << 6) + | getNextUtf8Byte (i++, count)); + } else if ((b1 & 0xf0) == 0xe0) { + // 3-byte sequence: + // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx + // most CJKV characters + c = (char) (((b1 & 0x0f) << 12) | + (getNextUtf8Byte (i++, count) << 6) | + getNextUtf8Byte (i++, count)); + } else if ((b1 & 0xf8) == 0xf0) { + // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx + // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx + // (uuuuu = wwww + 1) + // "Surrogate Pairs" ... from the "Astral Planes" + int iso646 = b1 & 07; + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count); + + if (iso646 <= 0xffff) { + c = (char) iso646; + } else { + if (iso646 > 0x0010ffff) + encodingError ( + "UTF-8 value out of range for Unicode", + iso646, 0); + iso646 -= 0x010000; + readBuffer [j++] = (char) (0xd800 | (iso646 >> 10)); + readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff)); + continue; + } + } else { + // The five and six byte encodings aren't supported; + // they exceed the Unicode (and XML) range. + encodingError ( + "unsupported five or six byte UTF-8 sequence", + 0xff & b1, i); + // NOTREACHED + c = 0; + } + } else { + // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx + // (US-ASCII character, "common" case, one branch to here) + c = (char) b1; + } + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + // How many characters have we read? + readBufferLength = j; + } + + + /** + * Return the next byte value in a UTF-8 sequence. + * If it is not possible to get a byte from the current + * entity, throw an exception. + * @param pos The current position in the rawReadBuffer. + * @param count The number of bytes in the rawReadBuffer + * @return The significant six bits of a non-initial byte in + * a UTF-8 sequence. + * @exception EOFException If the sequence is incomplete. + */ + private int getNextUtf8Byte (int pos, int count) + throws SAXException, IOException + { + int val; + + // Take a character from the buffer + // or from the actual input stream. + if (pos < count) { + val = rawReadBuffer [pos]; + } else { + val = is.read (); + if (val == -1) { + encodingError ("unfinished multi-byte UTF-8 sequence at EOF", + -1, pos); + } + } + + // Check for the correct bits at the start. + if ((val & 0xc0) != 0x80) { + encodingError ("bad continuation of multi-byte UTF-8 sequence", + val, pos + 1); + } + + // Return the significant bits. + return (val & 0x3f); + } + + + /** + * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into + * UTF-16 characters. + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * + * @param count The number of bytes to convert. + * @param mask For ASCII conversion, 0x7f; else, 0xff. + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyIso8859_1ReadBuffer (int count, char mask) + throws IOException + { + int i, j; + for (i = 0, j = readBufferPos; i < count; i++, j++) { + char c = (char) (rawReadBuffer [i] & 0xff); + if ((c & mask) != 0) + throw new CharConversionException ("non-ASCII character U+" + + Integer.toHexString (c)); + readBuffer [j] = c; + if (c == '\r') { + sawCR = true; + } + } + readBufferLength = j; + } + + + /** + * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters + * (as used in Java string manipulation). + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs2ReadBuffer (int count, int shift1, int shift2) + throws SAXException + { + int j = readBufferPos; + + if (count > 0 && (count % 2) != 0) { + encodingError ("odd number of bytes in UCS-2 encoding", -1, count); + } + // The loops are faster with less internal brancing; hence two + if (shift1 == 0) { // "UTF-16-LE" + for (int i = 0; i < count; i += 2) { + char c = (char) (rawReadBuffer [i + 1] << 8); + c |= 0xff & rawReadBuffer [i]; + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + } else { // "UTF-16-BE" + for (int i = 0; i < count; i += 2) { + char c = (char) (rawReadBuffer [i] << 8); + c |= 0xff & rawReadBuffer [i + 1]; + readBuffer [j++] = c; + if (c == '\r') + sawCR = true; + } + } + readBufferLength = j; + } + + + /** + * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + *

Java has Unicode chars, and this routine uses surrogate pairs + * for ISO-10646 values between 0x00010000 and 0x000fffff. An + * exception is thrown if the ISO-10646 character has no Unicode + * representation. + * + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @param shift3 The number of bits to shift byte 2 + * @param shift4 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs4ReadBuffer (int count, int shift1, int shift2, + int shift3, int shift4) + throws SAXException + { + int j = readBufferPos; + int value; + + if (count > 0 && (count % 4) != 0) { + encodingError ( + "number of bytes in UCS-4 encoding not divisible by 4", + -1, count); + } + for (int i = 0; i < count; i += 4) { + value = (((rawReadBuffer [i] & 0xff) << shift1) | + ((rawReadBuffer [i + 1] & 0xff) << shift2) | + ((rawReadBuffer [i + 2] & 0xff) << shift3) | + ((rawReadBuffer [i + 3] & 0xff) << shift4)); + if (value < 0x0000ffff) { + readBuffer [j++] = (char) value; + if (value == (int) '\r') { + sawCR = true; + } + } else if (value < 0x0010ffff) { + value -= 0x010000; + readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); + readBuffer [j++] = (char) (0xdc | (value & 0x03ff)); + } else { + encodingError ("UCS-4 value out of range for Unicode", + value, i); + } + } + readBufferLength = j; + } + + + /** + * Report a character encoding error. + */ + private void encodingError (String message, int value, int offset) + throws SAXException + { + String uri; + + if (value != -1) { + message = message + " (character code: 0x" + + Integer.toHexString (value) + ')'; + } + if (externalEntity != null) { + uri = externalEntity.getURL ().toString (); + } else { + uri = baseURI; + } + handler.error (message, uri, -1, offset + currentByteCount); + } + + + ////////////////////////////////////////////////////////////////////// + // Local Variables. + ////////////////////////////////////////////////////////////////////// + + /** + * Re-initialize the variables for each parse. + */ + private void initializeVariables () + { + // First line + line = 1; + column = 0; + + // Set up the buffers for data and names + dataBufferPos = 0; + dataBuffer = new char [DATA_BUFFER_INITIAL]; + nameBufferPos = 0; + nameBuffer = new char [NAME_BUFFER_INITIAL]; + + // Set up the DTD hash tables + elementInfo = new Hashtable (); + entityInfo = new Hashtable (); + notationInfo = new Hashtable (); + + // Set up the variables for the current + // element context. + currentElement = null; + currentElementContent = CONTENT_UNDECLARED; + + // Set up the input variables + sourceType = INPUT_NONE; + inputStack = new Stack (); + entityStack = new Stack (); + externalEntity = null; + tagAttributePos = 0; + tagAttributes = new String [100]; + rawReadBuffer = new byte [READ_BUFFER_MAX]; + readBufferOverflow = -1; + + inLiteral = false; + expandPE = false; + peIsError = false; + + inCDATA = false; + + symbolTable = new Object [SYMBOL_TABLE_LENGTH][]; + +// + showDocStructure = false; + translateExternalParsedEntities = false; + translateComments = false; + + try { + showDocStructure = ((String)handler.getProperty("http://4xt.org/inclusions/showDocStructure")).equals("yes"); + } catch (SAXNotRecognizedException e) { + showDocStructure = false; +// System.err.println(e.toString()); + } + + if (showDocStructure) { + translateExternalParsedEntities=true; + translateComments=true; + } else { + try { + translateExternalParsedEntities = ((String)handler.getProperty("http://4xt.org/inclusions/translateExternalParsedEntities")).equals("yes"); + } catch (SAXNotRecognizedException e) { + translateExternalParsedEntities = false; + } + try { + translateComments = ((String)handler.getProperty("http://4xt.org/inclusions/translateComments")).equals("yes"); + } catch (SAXNotRecognizedException e) { + translateComments = false; + } + } + + System.err.println("showDocStructure : "+showDocStructure); + System.err.println("translateExternalParsedEntities : "+translateExternalParsedEntities); + System.err.println("translateComments : "+translateComments); +// + + } + + + /** + * Clean up after the parse to allow some garbage collection. + */ + private void cleanupVariables () + { + dataBuffer = null; + nameBuffer = null; + + elementInfo = null; + entityInfo = null; + notationInfo = null; + + currentElement = null; + + inputStack = null; + entityStack = null; + externalEntity = null; + + tagAttributes = null; + rawReadBuffer = null; + + symbolTable = null; + } + + // + // The current XML handler interface. + // + private StructSaxDriver handler; + + // + // I/O information. + // + private Reader reader; // current reader + private InputStream is; // current input stream + private int line; // current line number + private int column; // current column number + private int sourceType; // type of input source + private Stack inputStack; // stack of input soruces + private URLConnection externalEntity; // current external entity + private int encoding; // current character encoding + private int currentByteCount; // bytes read from current source + + // + // Buffers for decoded but unparsed character input. + // + private char readBuffer []; + private int readBufferPos; + private int readBufferLength; + private int readBufferOverflow; // overflow from last data chunk. + + + // + // Buffer for undecoded raw byte input. + // + private final static int READ_BUFFER_MAX = 16384; + private byte rawReadBuffer []; + + + // + // Buffer for parsed character data. + // + private static int DATA_BUFFER_INITIAL = 4096; + private char dataBuffer []; + private int dataBufferPos; + + // + // Buffer for parsed names. + // + private static int NAME_BUFFER_INITIAL = 1024; + private char nameBuffer []; + private int nameBufferPos; + + + // + // Hashtables for DTD information on elements, entities, and notations. + // + private Hashtable elementInfo; + private Hashtable entityInfo; + private Hashtable notationInfo; + + + // + // Element type currently in force. + // + private String currentElement; + private int currentElementContent; + + // + // Base external identifiers for resolution. + // + private String basePublicId; + private String baseURI; + private int baseEncoding; + private Reader baseReader; + private InputStream baseInputStream; + private char baseInputBuffer []; + private int baseInputBufferStart; + private int baseInputBufferLength; + + // + // Stack of entity names, to detect recursion. + // + private Stack entityStack; + + // + // PE expansion is enabled in most chunks of the DTD, not all. + // When it's enabled, literals are treated differently. + // + private boolean inLiteral; + private boolean expandPE; + private boolean peIsError; + + // + // Symbol table, for caching interned names. + // + private final static int SYMBOL_TABLE_LENGTH = 1087; + private Object symbolTable [][]; + + // + // Hash table of attributes found in current start tag. + // + private String tagAttributes []; + private int tagAttributePos; + + // + // Utility flag: have we noticed a CR while reading the last + // data chunk? If so, we will have to go back and normalise + // CR or CR/LF line ends. + // + private boolean sawCR; + + // + // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. + // + private boolean inCDATA; +// + // + // Flags to define if we should show the document structure and if + // we should expend the external parsed entities + // + private boolean showDocStructure; + private boolean translateExternalParsedEntities; + private boolean translateComments; + + String version=null; + String encodingName = null; + String standalone = null; +// + + +} diff --git a/saxon-add-fixes-from-com-isl-saxon-aelfred.patch b/saxon-add-fixes-from-com-isl-saxon-aelfred.patch new file mode 100644 index 0000000..9dc14ef --- /dev/null +++ b/saxon-add-fixes-from-com-isl-saxon-aelfred.patch @@ -0,0 +1,1072 @@ +--- XmlParser.java.old 2012-12-04 12:58:09.544956168 +0100 ++++ XmlParser.java 2012-12-04 15:28:20.798929872 +0100 +@@ -62,10 +62,12 @@ + // The modification over the original source are flagged by + // tags. + +-// +-// package org.brownell.xml.aelfred2; +-import org.brownell.xml.aelfred2.*; +-// ++// removed all modification and adapt the code to be able to replace ++// the GPL com/icl/saxo/aelfred/XmlParser.java, which is not compatible with ++// MPL license of the rest ++// mvyskocil@suse.com ++ ++package com.icl.saxon.aelfred; + + import java.io.BufferedInputStream; + import java.io.CharConversionException; +@@ -81,27 +83,20 @@ + import java.util.Stack; + + import org.xml.sax.SAXException; +-// +-import org.xml.sax.SAXNotRecognizedException; +-// +- + // $Id: XmlParser.java,v 1.20 2000/05/29 12:10:24 mojo Exp $ + + /** + * Parse XML documents and return parse events through call-backs. +- * Use the StructSaxDriver class as your entry point, as all ++ * Use the SAXDriver class as your entry point, as all + * internal parser interfaces are subject to change. + * + * @author Written by David Megginson <dmeggins@microstar.com> + * (version 1.2a with bugfixes) + * @author Updated by David Brownell <david-b@pacbell.net> + * @version $Date: 2000/05/29 12:10:24 $ +- * @see StructSaxDriver ++ * @see SAXDriver + */ +-// +-// final class XmlParser +-final class StructXmlParser +-// ++final class XmlParser + { + // parse from buffer, avoiding slow per-character readCh() + private final static boolean USE_CHEATS = true; +@@ -121,10 +116,7 @@ + * @see #parse + */ + // package private +-// +-// XmlParser () +- StructXmlParser () +-// ++ XmlParser () + { + cleanupVariables (); + } +@@ -136,10 +128,7 @@ + * @see #parse + */ + // package private +-// +-// final class XmlParser +-// void setHandler (SaxDriver handler) +- void setHandler (StructSaxDriver handler) ++ void setHandler (SAXDriver handler) + // + { + this.handler = handler; +@@ -194,7 +183,7 @@ + handler.startDocument (); + + pushURL ("[document]", basePublicId, baseURI, +- baseReader, baseInputStream, encoding); ++ baseReader, baseInputStream, encoding, false); + + try { + parseDocument (); +@@ -461,6 +450,8 @@ + private final static int LIT_DISABLE_EREF = 64; + // don't expand general entities, but make sure we _could_ + private final static int LIT_ENTITY_CHECK = 128; ++ // literal is a public ID value ++ private final static int LIT_PUBID = 256; + + + // +@@ -480,7 +471,7 @@ + * Report an error. + * @param message The error message. + * @param textFound The text that caused the error (or null). +- * @see StructSaxDriver#error ++ * @see SAXDriver#error + * @see #line + */ + private void error (String message, String textFound, String textExpected) +@@ -543,47 +534,13 @@ + { + char c; + +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the comments +- // +- +- handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); +- handler.startElement ("str:document"); +- } +-// +- ++ try { + parseProlog (); +- +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the document structure +- // +- +- handler.startElement ("str:body"); +- +- } +-// +- + require ('<'); + parseElement (); +- +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the document structure +- // +- +- handler.endElement ("str:body"); +- handler.startElement ("str:epilog"); +- +- } +-// ++ } catch (EOFException eofe) { ++ error("premature end of file"); ++ } + + try { + parseMisc (); //skip all white, PIs, and comments +@@ -593,19 +550,6 @@ + return; + } + +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the comments +- // +- +- handler.endElement ("str:epilog"); +- handler.endElement ("str:document"); +- } +-// +- +- + } + + +@@ -626,23 +570,7 @@ + parseUntil ("--"); + require ('>'); + expandPE = saved; +-// +- if (! translateComments ) { +-// +- handler.comment (dataBuffer, 0, dataBufferPos); +-// +- } else { +- // +- // Struct: updated to show the comments +- // +- System.err.println("Comment : " ); +- handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); +- handler.startElement ("str:comment"); +- handler.charData (dataBuffer, 0, dataBufferPos); +- handler.endElement ("str:comment"); +- +- } +-// ++ handler.comment (dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + +@@ -711,45 +639,12 @@ + private void parseProlog () + throws Exception + { +- +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the prolog +- // +- +- handler.startElement ("str:prolog"); +- if (version != null) +- handler.attribute ("str:version", version, false); +- if (encodingName != null) +- handler.attribute ("str:encoding", encodingName, false); +- if (standalone != null) +- handler.attribute ("str:standalone", standalone, false); +- handler.startElement ("str:X-M-L-Decl"); +- handler.endElement ("str:X-M-L-Decl"); +- +- } +-// +- + parseMisc (); + + if (tryRead (" +- if (showDocStructure) { +- +- // +- // Struct: updated to show the prolog +- // +- +- handler.endElement ("str:prolog"); +- } +-// +- + } + + +@@ -775,12 +670,14 @@ + throws SAXException, IOException + { + boolean white; ++ String encodingName = null; ++ String standalone; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + // Read the version. + require ("version"); + parseEq (); +- version = readLiteral (flags); ++ String version = readLiteral (flags); + if (!version.equals ("1.0")) { + error ("unsupported XML version", version, "1.0"); + } +@@ -905,10 +802,8 @@ + encoding = ENCODING_UTF_8; + return; + } else if (encoding != ENCODING_EXTERNAL) { +- // fatal error +- error ("unsupported ASCII-derived encoding", +- encodingName, +- "UTF-8, US-ASCII, or ISO-8859-1"); ++ // used to start with a new reader ... ++ throw new EncodingException(encodingName); + } + // else fallthrough ... + // it's ASCII-ish and something other than a builtin +@@ -962,7 +857,7 @@ + + reader = new InputStreamReader (is, encodingName); + sourceType = INPUT_READER; +- is = null; ++ //is = null; + } + + +@@ -1014,24 +909,6 @@ + // report (a) declaration of name, (b) lexical info (ids) + handler.doctypeDecl (doctypeName, ids [0], ids [1]); + +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the doctype +- // +- +- if (doctypeName != null) +- handler.attribute ("str:name", doctypeName, false); +- if (ids [0] != null) +- handler.attribute ("str:publicId", ids [0], false); +- if (ids [1] != null) +- handler.attribute ("str:systemId", ids [1], false); +- handler.startElement ("str:doctype"); +- +- } +-// +- + // Internal subset is parsed first, if present + skipWhitespace (); + if (tryRead ('[')) { +@@ -1054,7 +931,7 @@ + + // Read the external subset, if any + if (ids [1] != null) { +- pushURL ("[external subset]", ids [0], ids [1], null, null, null); ++ pushURL ("[external subset]", ids [0], ids [1], null, null, null, false); + + // Loop until we end up back at '>' + while (true) { +@@ -1078,20 +955,6 @@ + // done dtd + handler.endDoctype (); + expandPE = false; +- +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the doctype +- // +- +- handler.endElement ("str:doctype"); +- +- } +-// +- +- + } + + +@@ -1191,9 +1054,10 @@ + } + } + // I guess not... +- handler.attribute (aname, +- getAttributeExpandedValue (gi, aname), +- false); ++ String foo = getAttributeExpandedValue (gi, aname); ++ if ( foo != null) { ++ handler.attribute (aname, foo, false); ++ } + } + } + +@@ -1224,7 +1088,7 @@ + * [41] Attribute ::= Name Eq AttValue + * + * @param name The name of the attribute's element. +- * @see StructSaxDriver#attribute ++ * @see SAXDriver#attribute + */ + private void parseAttribute (String name) + throws Exception +@@ -1321,16 +1185,7 @@ + char c; + + while (true) { +- +- switch (currentElementContent) { +- case CONTENT_ANY: +- case CONTENT_MIXED: +- parseCharData (); +- break; +- case CONTENT_ELEMENTS: +- parseWhitespace (); +- break; +- } ++ parseCharData (); + + // Handle delimiters + c = readCh (); +@@ -1425,10 +1280,10 @@ + throws Exception + { + if (tryRead ("EMPTY")) { +- setElement (name, CONTENT_EMPTY, null); ++ setElement (name, CONTENT_EMPTY, null, null); + return; + } else if (tryRead ("ANY")) { +- setElement (name, CONTENT_ANY, null); ++ setElement (name, CONTENT_ANY, null, null); + return; + } else { + require ('('); +@@ -1437,10 +1292,10 @@ + if (tryRead ("#PCDATA")) { + dataBufferAppend ("#PCDATA"); + parseMixed (); +- setElement (name, CONTENT_MIXED, dataBufferToString ()); ++ setElement (name, CONTENT_MIXED, dataBufferToString (), null); + } else { + parseElements (); +- setElement (name, CONTENT_ELEMENTS, dataBufferToString ()); ++ setElement (name, CONTENT_ELEMENTS, dataBufferToString (), null); + } + } + } +@@ -1624,7 +1479,7 @@ + { + String name; + int type; +- String enum = null; ++ String enum2 = null; + + // Read the attribute name. + name = readNmtoken (true); +@@ -1636,12 +1491,12 @@ + // Get the string of enumerated values + // if necessary. + if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { +- enum = dataBufferToString (); ++ enum2 = dataBufferToString (); + } + + // Read the default value. + requireWhitespace (); +- parseDefault (elementName, name, type, enum); ++ parseDefault (elementName, name, type, enum2); + } + + +@@ -1739,12 +1594,14 @@ + String elementName, + String name, + int type, +- String enum ++ String enum2 + ) throws Exception + { + int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; + String value = null; +- int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK; ++ int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK | LIT_DISABLE_PE; ++ // ^^^^^^^^^^^^^^ ++ // added MHK 20 Mar 2002 + + // Note: char refs not checked here, and input not normalized, + // since it's done correctly later when we actually expand any +@@ -1769,7 +1626,7 @@ + } + } else + value = readLiteral (flags); +- setAttribute (elementName, name, type, enum, value, valueType); ++ setAttribute (elementName, name, type, enum2, value, valueType); + } + + +@@ -1946,51 +1803,35 @@ + + name = readNmtoken (true); + require (';'); +-// +- if (! translateExternalParsedEntities) { +-// +- switch (getEntityType (name)) { +- case ENTITY_UNDECLARED: +- error ("reference to undeclared entity", name, null); +- break; +- case ENTITY_INTERNAL: +- System.err.println("Internal"); +- pushString (name, getEntityValue (name)); +- break; +- case ENTITY_TEXT: +- System.err.println("Text"); +- if (externalAllowed) { +- pushURL (name, getEntityPublicId (name), +- getEntitySystemId (name), +- null, null, null); +- } else { +- error ("reference to external entity in attribute value.", +- name, null); +- } +- break; +- case ENTITY_NDATA: +- System.err.println("NDATA"); +- if (externalAllowed) { +- error ("unparsed entity reference in content", name, null); +- } else { +- error ("reference to external entity in attribute value.", +- name, null); +- } +- break; +- } +-// +- } else { +- // +- // Struct: updated to show the entity call +- // +- System.err.println("Entity reference : " + name); +- handler.attribute ("xmlns:str", "http://4xt.org/ns/xmlstructure", false); +- handler.attribute ("str:name", name, false); +- handler.startElement ("str:entity"); +- handler.endElement ("str:entity"); +- +- } +- // ++ switch (getEntityType (name)) { ++ case ENTITY_UNDECLARED: ++ error ("reference to undeclared entity", name, null); ++ break; ++ case ENTITY_INTERNAL: ++ pushString (name, getEntityValue (name)); ++ System.err.println("Internal"); ++ break; ++ case ENTITY_TEXT: ++ System.err.println("Text"); ++ if (externalAllowed) { ++ pushURL (name, getEntityPublicId (name), ++ getEntitySystemId (name), ++ null, null, null, true); ++ } else { ++ error ("reference to external entity in attribute value.", ++ name, null); ++ } ++ break; ++ case ENTITY_NDATA: ++ System.err.println("NDATA"); ++ if (externalAllowed) { ++ error ("unparsed entity reference in content", name, null); ++ } else { ++ error ("reference to external entity in attribute value.", ++ name, null); ++ } ++ break; ++ } + } + + +@@ -2027,7 +1868,7 @@ + pushString (null, " "); + pushURL (name, getEntityPublicId (name), + getEntitySystemId (name), +- null, null, null); ++ null, null, null, true); + if (!inLiteral) + pushString (null, " "); + break; +@@ -2096,24 +1937,6 @@ + notationName = readNmtoken (true); + setExternalDataEntity (name, ids [0], ids [1], notationName); + } else { +- +-// +- if (showDocStructure) { +- +- // +- // Struct: updated to show the external entities definitions +- // +- +- handler.attribute ("str:name", name, false); +- handler.attribute ("str:type", ids [0], false); +- handler.attribute ("str:systemId", ids [1], false); +- handler.startElement ("str:externalEntityDefinition"); +- handler.endElement ("str:externalEntityDefinition"); +- +- } +-// +- +- + setExternalTextEntity (name, ids [0], ids [1]); + } + } +@@ -2222,6 +2045,7 @@ + + // OK, the cheat didn't work; start over + // and do it by the book. ++ int closeSquareBracketCount = 0; + while (true) { + c = readCh (); + switch (c) { +@@ -2229,8 +2053,19 @@ + case '&': + unread (c); + return; +- // XXX "]]>" precluded ... ++ case ']': ++ closeSquareBracketCount++; ++ dataBufferAppend(c); ++ break; ++ case '>': ++ if (closeSquareBracketCount>=2) { ++ // we've hit ']]>' ++ error ("']]>' is not allowed here"); ++ break; ++ } ++ // fall-through + default: ++ closeSquareBracketCount=0; + dataBufferAppend (c); + break; + } +@@ -2486,6 +2321,9 @@ + // Can't escape this normalization for attributes + case '\n': + case '\r': ++ if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) ++ c = ' '; ++ break; + case '\t': + if ((flags & LIT_ATTRIBUTE) != 0) + c = ' '; +@@ -2497,7 +2335,6 @@ + if (c == '#') { + if ((flags & LIT_DISABLE_CREF) != 0) { + dataBufferAppend ('&'); +- dataBufferAppend ('#'); + continue; + } + parseCharRef (); +@@ -2578,7 +2415,7 @@ + + if (tryRead ("PUBLIC")) { + requireWhitespace (); +- ids [0] = readLiteral (LIT_NORMALIZE | flags); ++ ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags); + if (inNotation) { + skipWhitespace (); + c = readCh (); +@@ -2987,30 +2824,32 @@ + * [2] attribute hash table + */ + private Object [] +- setElement (String name, int contentType, String contentModel) ++ setElement (String name, int contentType, String contentModel, Hashtable attributes) throws Exception + { +- Object element [] = (Object []) elementInfo.get (name); ++ Object element[] = (Object []) elementInfo.get (name); + + // first or for this type + if (element == null) { + element = new Object [3]; + element [0] = new Integer (contentType); + element [1] = contentModel; +- element [2] = new Hashtable (DEFAULT_ATTR_COUNT); ++ element [2] = attributes; + elementInfo.put (name, element); + return element; + } + + // multiple declarations ++ if (contentType != CONTENT_UNDECLARED) { + if (((Integer) element [0]).intValue () != CONTENT_UNDECLARED) { +- // warn ("multiple declarations for element type", name, null); +- return element; +- } +- + // after associated + element [0] = new Integer (contentType); + element [1] = contentModel; ++ } + ++ } ++ else if (attributes != null) { ++ element[2] = attributes; ++ } + return element; + } + +@@ -3023,9 +2862,9 @@ + { + Object element[] = (Object[]) elementInfo.get (name); + +- if (element == null) +- element = setElement (name, CONTENT_UNDECLARED, null); +- return (Hashtable) element [2]; ++ if (element == null) return null; ++ ++ return (Hashtable) element [2]; + } + + +@@ -3211,6 +3050,9 @@ + + // Create a new hashtable if necessary. + attlist = getElementAttributes (elName); ++ if (attlist == null) { ++ attlist = new Hashtable (); ++ } + + // ignore multiple attribute declarations! + if (attlist.get (name) != null) { +@@ -3224,6 +3066,9 @@ + attribute [3] = enumeration; + attribute [4] = null; + attlist.put (name, attribute); ++ ++ // save; but don't overwrite any existing ++ setElement (elName, CONTENT_UNDECLARED, null, attlist); + } + } + +@@ -3321,8 +3166,18 @@ + if (entity == null) { + return null; + } else { +- return (String) entity [2]; +- } ++ try { ++ String relativeURI = (String)entity [2]; ++ URL baseURI = (URL)entity [5]; ++ if (baseURI==null) return relativeURI; ++ URL absoluteURI = new URL( baseURI, relativeURI ); ++ return absoluteURI.toString(); ++ } catch (IOException err) { ++ // ignore the exception, a user entity resolver may be able ++ // to do something; if not, the error will be caught later ++ return (String)entity [2]; ++ } ++ } + } + + +@@ -3402,12 +3257,14 @@ + Object entity[]; + + if (entityInfo.get (eName) == null) { +- entity = new Object [5]; ++ entity = new Object [6]; + entity [0] = new Integer (eClass); + entity [1] = pubid; + entity [2] = sysid; + entity [3] = value; + entity [4] = nName; ++ entity [5] = (externalEntity == null ? null : externalEntity.getURL()); ++ // added MHK: provides base URI for resolution + + entityInfo.put (eName, entity); + } +@@ -3554,7 +3411,6 @@ + while (readBufferPos >= readBufferLength) { + switch (sourceType) { + case INPUT_READER: +- case INPUT_EXTERNAL: + case INPUT_STREAM: + readDataChunk (); + while (readBufferLength < 1) { +@@ -3578,9 +3434,9 @@ + line++; + column = 0; + } else { +- if (c == '<') ++ if (c == '<') { + /* favorite return to parseContent () .. NOP */ ; +- else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) ++ }else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) + error ("illegal XML character U+" + + Integer.toHexString (c)); + +@@ -3589,7 +3445,7 @@ + // are also spots in the internal subset where PE refs are fatal + // errors, hence yet another flag. + else if (c == '%' && expandPE) { +- if (peIsError) ++ if (peIsError && entityStack.size()==1) + error ("PE reference within decl in internal subset."); + parsePEReference (); + return readCh (); +@@ -3669,7 +3525,7 @@ + * request an encoding explicitly, and it should also look at the + * headers with an HTTP connection. + * @param url The java.net.URL object for the entity. +- * @see StructSaxDriver#resolveEntity ++ * @see SAXDriver#resolveEntity + * @see #pushString + * @see #sourceType + * @see #pushInput +@@ -3683,7 +3539,8 @@ + String systemId, + Reader reader, + InputStream stream, +- String encoding ++ String encoding, ++ boolean isAbsolute + ) throws SAXException, IOException + { + URL url; +@@ -3700,28 +3557,43 @@ + readBufferOverflow = -1; + is = null; + line = 1; ++ column = 0; + + currentByteCount = 0; + ++ if (!isAbsolute) { + // Make any system ID (URI/URL) absolute. There's one case + // where it may be null: parser was invoked without providing + // one, e.g. since the XML data came from a memory buffer. + ++ try { + if (systemId != null && externalEntity != null) { + systemId = new URL (externalEntity.getURL (), systemId).toString (); + } else if (baseURI != null) { + systemId = new URL (new URL (baseURI), systemId).toString (); + // throws IOException if couldn't create new URL + } ++ } catch(java.io.IOException ioe) { ++ popInput(); ++ error("Invalid URL " + systemId + "\n" + ioe.getMessage()); ++ } ++ } + + // See if the application wants to + // redirect the system ID and/or + // supply its own character stream. + if (reader == null && stream == null && systemId != null) { +- Object input = handler.resolveEntity (publicId, systemId); ++ Object input = null; ++ try { ++ input = handler.resolveEntity (publicId, systemId); ++ } catch (java.io.IOException ioe){ ++ popInput(); ++ error("Failure resolving entity " + systemId + "\n" + ioe.getMessage()); ++ } + if (input != null) { + if (input instanceof String) { + systemId = (String) input; ++ isAbsolute = true; + } else if (input instanceof InputStream) { + stream = (InputStream) input; + } else if (input instanceof Reader) { +@@ -3748,20 +3620,23 @@ + + // Else we handle the conversion, and need to ensure + // it's done right. ++ sourceType = INPUT_STREAM; + if (stream != null) { +- sourceType = INPUT_STREAM; + is = stream; + url = null; + } else { + // We have to open our own stream to the URL. + +- // Set the new status +- sourceType = INPUT_EXTERNAL; + url = new URL (systemId); + ++ try { + externalEntity = url.openConnection (); + externalEntity.connect (); + is = externalEntity.getInputStream (); ++ } catch (java.io.IOException ioe){ ++ popInput(); ++ error("Cannot read from " + systemId + "\n" + ioe.getMessage()); ++ } + } + + // If we get to here, there must be +@@ -3781,7 +3656,7 @@ + // application/xml;charset=something;otherAttr=... + // ... with many variants on 'something' + encoding = externalEntity.getContentType (); +- temp = encoding.indexOf ("charset"); ++ temp = (encoding != null) ? encoding.indexOf ("charset") : -1; + + // RFC 2376 sez MIME text defaults to ASCII, but since the + // JDK will create a MIME type out of thin air, we always +@@ -3790,7 +3665,7 @@ + encoding = null; // autodetect + else { + temp = encoding.indexOf ('=', temp + 7); +- encoding = encoding.substring (temp); ++ encoding = encoding.substring (temp+1); + if ((temp = encoding.indexOf (';')) > 0) + encoding = encoding.substring (0, temp); + +@@ -3817,9 +3692,39 @@ + detectEncoding (); + ignoreEncoding = false; + } ++ is.mark(100); + + // Read any XML or text declaration. +- tryEncodingDecl (ignoreEncoding); ++ try { ++ tryEncodingDecl (ignoreEncoding); ++ } catch (EncodingException x) { ++ encoding = x.getMessage (); ++ ++ // if we don't handle the declared encoding, ++ // try letting a JVM InputStreamReader do it ++ try { ++ if (sourceType != INPUT_STREAM) ++ throw x; ++ ++ is.reset (); ++ readBufferPos = 0; ++ readBufferLength = 0; ++ readBufferOverflow = -1; ++ line = 1; ++ currentByteCount = column = 0; ++ ++ sourceType = INPUT_READER; ++ this.reader = new InputStreamReader (is, encoding); ++ is = null; ++ ++ tryEncodingDecl (true); ++ ++ } catch (IOException e) { ++ error ("unsupported text encoding", ++ encoding, ++ null); ++ } ++ } + } + + +@@ -3916,6 +3821,14 @@ + // ff fe 00 00 UCS_4_4321 (with BOM) + } + ++ // SECOND: three byte signature: ++ // look for UTF-8 byte order mark 3C 3F 78, allowed by XML 1.0 2nd edition ++ ++ else if (tryEncoding (signature, (byte)0xef, (byte)0xbb, (byte)0xbf)) { ++ encoding = ENCODING_UTF_8; ++ is.read(); is.read(); is.read(); ++ } ++ + // + // SECOND: two byte encodings + // note ... with 1/14/2000 errata the XML spec identifies some +@@ -4002,6 +3915,20 @@ + return ((sig [0] == b1) && (sig [1] == b2)); + } + ++ /** ++ * Check for a three-byte signature. ++ *

Looks for a UTF-8 byte-order mark. ++ *

Utility routine for detectEncoding (). ++ * @param sig The first four bytes read. ++ * @param b1 The first byte of the signature ++ * @param b2 The second byte of the signature ++ * @param b3 The second byte of the signature ++ * @see #detectEncoding ++ */ ++ private static boolean tryEncoding (byte sig[], byte b1, byte b2, byte b3) ++ { ++ return ((sig [0] == b1) && (sig [1] == b2) && (sig [2] == b3)); ++ } + + /** + * This method pushes a string back onto input. +@@ -4131,25 +4058,24 @@ + private void popInput () + throws SAXException, IOException + { ++ String uri; + Object input[]; + ++ if (externalEntity != null) ++ uri = externalEntity.getURL ().toString (); ++ else ++ uri = baseURI; + + switch (sourceType) { + +- case INPUT_EXTERNAL: +- if (externalEntity != null) { +- handler.endExternalEntity ( +- externalEntity.getURL ().toString ()); +- } +- break; + case INPUT_STREAM: +- if (baseURI != null) { ++ if (is != null) { + handler.endExternalEntity (baseURI); + } + is.close (); + break; + case INPUT_READER: +- if (baseURI != null) { ++ if (reader != null && uri != null) { + handler.endExternalEntity (baseURI); + } + reader.close (); +@@ -4166,6 +4092,9 @@ + s = (String) entityStack.pop (); + } + ++ input = (Object[]) inputStack.pop (); ++ entityStack.pop (); ++ + sourceType = ((Integer) input [0]).intValue (); + externalEntity = (URLConnection) input [1]; + readBuffer = (char[]) input [2]; +@@ -4807,40 +4736,6 @@ + inCDATA = false; + + symbolTable = new Object [SYMBOL_TABLE_LENGTH][]; +- +-// +- showDocStructure = false; +- translateExternalParsedEntities = false; +- translateComments = false; +- +- try { +- showDocStructure = ((String)handler.getProperty("http://4xt.org/inclusions/showDocStructure")).equals("yes"); +- } catch (SAXNotRecognizedException e) { +- showDocStructure = false; +-// System.err.println(e.toString()); +- } +- +- if (showDocStructure) { +- translateExternalParsedEntities=true; +- translateComments=true; +- } else { +- try { +- translateExternalParsedEntities = ((String)handler.getProperty("http://4xt.org/inclusions/translateExternalParsedEntities")).equals("yes"); +- } catch (SAXNotRecognizedException e) { +- translateExternalParsedEntities = false; +- } +- try { +- translateComments = ((String)handler.getProperty("http://4xt.org/inclusions/translateComments")).equals("yes"); +- } catch (SAXNotRecognizedException e) { +- translateComments = false; +- } +- } +- +- System.err.println("showDocStructure : "+showDocStructure); +- System.err.println("translateExternalParsedEntities : "+translateExternalParsedEntities); +- System.err.println("translateComments : "+translateComments); +-// +- + } + + +@@ -4868,10 +4763,16 @@ + symbolTable = null; + } + ++ /* used to restart reading with some InputStreamReader */ ++ static class EncodingException extends IOException ++ { ++ EncodingException (String encoding) { super (encoding); } ++ } ++ + // + // The current XML handler interface. + // +- private StructSaxDriver handler; ++ private SAXDriver handler; + + // + // I/O information. +@@ -4979,19 +4880,4 @@ + // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. + // + private boolean inCDATA; +-// +- // +- // Flags to define if we should show the document structure and if +- // we should expend the external parsed entities +- // +- private boolean showDocStructure; +- private boolean translateExternalParsedEntities; +- private boolean translateComments; +- +- String version=null; +- String encodingName = null; +- String standalone = null; +-// +- +- + } diff --git a/saxon6.changes b/saxon6.changes index 12ef99e..351096b 100644 --- a/saxon6.changes +++ b/saxon6.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Tue Dec 4 14:31:30 UTC 2012 - mvyskocil@suse.com + +- fix bnc#739498 - remove GPL covered com/icl/saxon/aelfred/XmlParser + by file unde rmore permissive license + ------------------------------------------------------------------- Thu Nov 29 15:16:28 UTC 2012 - mvyskocil@suse.com diff --git a/saxon6.spec b/saxon6.spec index 57ee7f6..bdd54c9 100644 --- a/saxon6.spec +++ b/saxon6.spec @@ -30,9 +30,13 @@ Source0: http://download.sf.net/saxon/saxon6-5-5.zip Source1: %{name}.saxon.script Source2: %{name}.build.script Source3: %{name}.1 +Source4: http://www.xml.com/2000/08/09/xslt/StructXmlParser.java Patch2: %{name}-cmdlinefix.patch #PATCH-FIX-OPENSUSE: jdk7+ assumes ASCII as default encoding, so iso-8859-1 does not work by default Patch3: saxon-javac-encoding.patch +#PATCH-FIX-OPENSUSE: bnc#739498 - backport changes from com/icl/saxon/aelfred/XmlParser.java +# to older version released under more permissive license +Patch4: saxon-add-fixes-from-com-isl-saxon-aelfred.patch BuildRequires: jpackage-utils >= 1.6 BuildRequires: xml-commons-apis BuildRequires: jdom >= 1.0 @@ -149,8 +153,11 @@ Utility scripts for %{name}. %setup -q -c unzip -q source.zip cp -p %{SOURCE2} ./build.xml +cp -p %{SOURCE4} XmlParser.java %patch2 -p0 %patch3 -p1 +%patch4 -p0 +cp XmlParser.java com/icl/saxon/aelfred/XmlParser.java # cleanup unnecessary stuff we'll build ourselves rm -rf *.jar docs/api