abdera-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jmsn...@apache.org
Subject svn commit: r587550 [4/6] - in /incubator/abdera/java/trunk/extensions/json/src/main: java/nu/ java/nu/validator/ java/nu/validator/htmlparser/ java/nu/validator/htmlparser/common/ java/nu/validator/htmlparser/impl/ java/nu/validator/htmlparser/sax/ ja...
Date Tue, 23 Oct 2007 16:28:58 GMT
Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TreeBuilder.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TreeBuilder.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TreeBuilder.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TreeBuilder.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,3289 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla 
+ * Foundation, and Opera Software ASA.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * The comments following this one that use the same comment syntax as this 
+ * comment are quotes from the WHATWG HTML 5 spec as of 27 June 2007 
+ * amended as of June 28 2007.
+ * That document came with this statement:
+ * "© Copyright 2004-2007 Apple Computer, Inc., Mozilla Foundation, and 
+ * Opera Software ASA. You are granted a license to use, reproduce and 
+ * create derivative works of this document."
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.util.Arrays;
+
+import nu.validator.htmlparser.common.DoctypeExpectation;
+import nu.validator.htmlparser.common.DocumentMode;
+import nu.validator.htmlparser.common.DocumentModeHandler;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+public abstract class TreeBuilder<T> implements TokenHandler {
+
+    private enum Phase {
+        INITIAL, ROOT_ELEMENT, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD, IN_BODY, IN_TABLE, IN_CAPTION, IN_COLUMN_GROUP, IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, TRAILING_END
+    }
+
+    private class StackNode<S> {
+        final String name;
+
+        final S node;
+        
+        final boolean scoping;
+        
+        final boolean special;
+
+        final boolean fosterParenting;
+        
+        /**
+         * @param name
+         * @param node
+         * @param scoping
+         * @param special
+         */
+        StackNode(final String name, final S node, final boolean scoping, final boolean special, final boolean fosterParenting) {
+            this.name = name;
+            this.node = node;
+            this.scoping = scoping;
+            this.special = special;
+            this.fosterParenting = fosterParenting;
+        }
+
+        /**
+         * @param name
+         * @param node
+         */
+        StackNode(final String name, final S node) {
+            this.name = name;
+            this.node = node;
+            this.scoping = ("table" == name || "caption" == name || "td" == name || "th" == name || "button" == name || "marquee" == name || "object" == name);
+            this.special = ("address" == name || "area" == name || "base" == name || "basefont" == name || "bgsound" == name || "blockquote" == name || "body" == name || "br" == name || "center" == name || "col" == name || "colgroup" == name || "dd" == name || "dir" == name || "div" == name || "dl" == name || "dt" == name || "embed" == name || "fieldset" == name || "form" == name || "frame" == name || "frameset" == name || "h1" == name || "h2" == name || "h3" == name || "h4" == name || "h5" == name || "h6" == name || "head" == name || "hr" == name || "iframe" == name || "image" == name || "img" == name || "input" == name || "isindex" == name || "li" == name || "link" == name || "listing" == name || "menu" == name || "meta" == name || "noembed" == name || "noframes" == name || "noscript" == name || "ol" == name || "optgroup" == name || "option" == name || "p" == name || "param" == name || "plaintext" == name || "pre" == name || "script" == name || "select" == name || "spacer"
  == name || "style" == name || "tbody" == name || "textarea" == name || "tfoot" == name || "thead" == name || "title" == name || "tr" == name || "ul" == name ||  "wbr" == name);
+            this.fosterParenting = ("table" == name || "tbody" == name || "tfoot" == name || "thead" == name || "tr" == name);
+        }
+
+        /**
+         * @see java.lang.Object#toString()
+         */
+        @Override
+        public String toString() {
+            return name;
+        }
+    }
+    
+    private final static char[] ISINDEX_PROMPT = "This is a searchable index. Insert your search keywords here: ".toCharArray();
+
+    private final static String[] QUIRKY_PUBLIC_IDS = {
+            "+//silmaril//dtd html pro v0r11 19970101//en",
+            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
+            "-//as//dtd html 3.0 aswedit + extensions//en",
+            "-//ietf//dtd html 2.0 level 1//en",
+            "-//ietf//dtd html 2.0 level 2//en",
+            "-//ietf//dtd html 2.0 strict level 1//en",
+            "-//ietf//dtd html 2.0 strict level 2//en",
+            "-//ietf//dtd html 2.0 strict//en", "-//ietf//dtd html 2.0//en",
+            "-//ietf//dtd html 2.1e//en", "-//ietf//dtd html 3.0//en",
+            "-//ietf//dtd html 3.0//en//", "-//ietf//dtd html 3.2 final//en",
+            "-//ietf//dtd html 3.2//en", "-//ietf//dtd html 3//en",
+            "-//ietf//dtd html level 0//en",
+            "-//ietf//dtd html level 0//en//2.0",
+            "-//ietf//dtd html level 1//en",
+            "-//ietf//dtd html level 1//en//2.0",
+            "-//ietf//dtd html level 2//en",
+            "-//ietf//dtd html level 2//en//2.0",
+            "-//ietf//dtd html level 3//en",
+            "-//ietf//dtd html level 3//en//3.0",
+            "-//ietf//dtd html strict level 0//en",
+            "-//ietf//dtd html strict level 0//en//2.0",
+            "-//ietf//dtd html strict level 1//en",
+            "-//ietf//dtd html strict level 1//en//2.0",
+            "-//ietf//dtd html strict level 2//en",
+            "-//ietf//dtd html strict level 2//en//2.0",
+            "-//ietf//dtd html strict level 3//en",
+            "-//ietf//dtd html strict level 3//en//3.0",
+            "-//ietf//dtd html strict//en",
+            "-//ietf//dtd html strict//en//2.0",
+            "-//ietf//dtd html strict//en//3.0", "-//ietf//dtd html//en",
+            "-//ietf//dtd html//en//2.0", "-//ietf//dtd html//en//3.0",
+            "-//metrius//dtd metrius presentational//en",
+            "-//microsoft//dtd internet explorer 2.0 html strict//en",
+            "-//microsoft//dtd internet explorer 2.0 html//en",
+            "-//microsoft//dtd internet explorer 2.0 tables//en",
+            "-//microsoft//dtd internet explorer 3.0 html strict//en",
+            "-//microsoft//dtd internet explorer 3.0 html//en",
+            "-//microsoft//dtd internet explorer 3.0 tables//en",
+            "-//netscape comm. corp.//dtd html//en",
+            "-//netscape comm. corp.//dtd strict html//en",
+            "-//o'reilly and associates//dtd html 2.0//en",
+            "-//o'reilly and associates//dtd html extended 1.0//en",
+            "-//spyglass//dtd html 2.0 extended//en",
+            "-//sq//dtd html 2.0 hotmetal + extensions//en",
+            "-//sun microsystems corp.//dtd hotjava html//en",
+            "-//sun microsystems corp.//dtd hotjava strict html//en",
+            "-//w3c//dtd html 3 1995-03-24//en",
+            "-//w3c//dtd html 3.2 draft//en", "-//w3c//dtd html 3.2 final//en",
+            "-//w3c//dtd html 3.2//en", "-//w3c//dtd html 3.2s draft//en",
+            "-//w3c//dtd html 4.0 frameset//en",
+            "-//w3c//dtd html 4.0 transitional//en",
+            "-//w3c//dtd html experimental 19960712//en",
+            "-//w3c//dtd html experimental 970421//en",
+            "-//w3c//dtd w3 html//en", "-//w3o//dtd w3 html 3.0//en",
+            "-//w3o//dtd w3 html 3.0//en//",
+            "-//w3o//dtd w3 html strict 3.0//en//",
+            "-//webtechs//dtd mozilla html 2.0//en",
+            "-//webtechs//dtd mozilla html//en",
+            "-/w3c/dtd html 4.0 transitional/en", "html" };
+
+    private static final int NOT_FOUND_ON_STACK = Integer.MAX_VALUE;
+    
+    private final StackNode<T> MARKER = new StackNode<T>(null, null);
+
+    private final boolean nonConformingAndStreaming;
+
+    private final boolean conformingAndStreaming;
+    
+    private final boolean coalescingText;   
+    
+    private Phase phase = Phase.INITIAL;
+
+    protected Tokenizer tokenizer;
+
+    private ErrorHandler errorHandler;
+
+    private DocumentModeHandler documentModeHandler;
+
+    private DoctypeExpectation doctypeExpectation = DoctypeExpectation.HTML;
+
+    private int cdataOrRcdataTimesToPop;
+
+    private boolean scriptingEnabled = false;
+    
+    private boolean needToDropLF;
+
+    private boolean wantingComments;
+
+    private String context;
+    
+    private Phase previousPhaseBeforeTrailingEnd;
+    
+    private StackNode<T>[] stack;
+    
+    private int currentPtr = -1;
+
+    private StackNode<T>[] listOfActiveFormattingElements;
+
+    private int listPtr = -1;
+    
+    private T formPointer;
+
+    private T headPointer;
+
+    private boolean reportingDoctype = true;
+    
+    private char[] charBuffer;
+    
+    private int charBufferLen = 0;
+    
+    protected TreeBuilder(XmlViolationPolicy streamabilityViolationPolicy, boolean coalescingText) {
+        this.conformingAndStreaming = streamabilityViolationPolicy == XmlViolationPolicy.FATAL;
+        this.nonConformingAndStreaming = streamabilityViolationPolicy == XmlViolationPolicy.ALTER_INFOSET;
+        this.coalescingText = coalescingText;
+        if (coalescingText) {
+            charBuffer = new char[1024];        
+        }
+    }
+    
+    /**
+     * Reports an condition that would make the infoset incompatible with XML
+     * 1.0 as fatal.
+     * 
+     * @throws SAXException
+     * @throws SAXParseException
+     */
+    protected final void fatal() throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        SAXParseException spe = new SAXParseException("Last error required non-streamable recovery.", tokenizer);
+        errorHandler.fatalError(spe);
+        throw spe;
+    }
+    protected final void fatal(Exception e) throws SAXException {
+        SAXParseException spe = new SAXParseException(e.getMessage(), tokenizer, e);;
+        if (errorHandler != null) {
+            errorHandler.fatalError(spe);
+        }
+        throw spe;
+    }
+    
+    /**
+     * Reports a Parse Error.
+     * 
+     * @param message
+     *            the message
+     * @throws SAXException
+     */
+    protected final void err(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        SAXParseException spe = new SAXParseException(message, tokenizer);
+        errorHandler.error(spe);
+    }
+
+    /**
+     * Reports a warning
+     * 
+     * @param message
+     *            the message
+     * @throws SAXException
+     */
+    protected final void warn(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        SAXParseException spe = new SAXParseException(message, tokenizer);
+        errorHandler.warning(spe);
+    }
+
+    public final void start(Tokenizer self) throws SAXException {
+        tokenizer = self;
+        stack  = new StackNode[64];
+        listOfActiveFormattingElements  = new StackNode[64];
+        needToDropLF = false;
+        cdataOrRcdataTimesToPop = 0;
+        currentPtr = -1;
+        formPointer = null;
+        wantingComments = wantsComments();
+        start(context != null);
+        if (context == null) {
+            phase = Phase.INITIAL;
+        } else {
+            T elt = createHtmlElementSetAsRoot(tokenizer.newAttributes());
+            StackNode<T> node = new StackNode<T>("html", elt);
+            currentPtr++;
+            stack[currentPtr] = node;
+            resetTheInsertionMode();
+            if ("title" == context || "textarea" == context) {
+                tokenizer.setContentModelFlag(ContentModelFlag.RCDATA, context);
+            } else if ("style" == context || "script" == context || "xmp" == context || "iframe" == context || "noembed" == context || "noframes" == context || (scriptingEnabled && "noscript" == context)) {
+                tokenizer.setContentModelFlag(ContentModelFlag.CDATA, context);                
+            } else if ("plaintext" == context) {
+                tokenizer.setContentModelFlag(ContentModelFlag.PLAINTEXT, context);                       
+            } else {
+                tokenizer.setContentModelFlag(ContentModelFlag.PCDATA, context);                                       
+            }
+        }
+    }
+
+    public final void doctype(String name, String publicIdentifier,
+            String systemIdentifier, boolean correct) throws SAXException {
+        needToDropLF = false;
+        switch (phase) {
+            case INITIAL:
+                /*
+                 * A DOCTYPE token If the DOCTYPE token's name does not
+                 * case-insensitively match the string "HTML", or if the token's
+                 * public identifier is not missing, or if the token's system
+                 * identifier is not missing, then there is a parse error.
+                 * Conformance checkers may, instead of reporting this error,
+                 * switch to a conformance checking mode for another language
+                 * (e.g. based on the DOCTYPE token a conformance checker could
+                 * recognise that the document is an HTML4-era document, and
+                 * defer to an HTML4 conformance checker.)
+                 * 
+                 * Append a DocumentType node to the Document node, with the
+                 * name attribute set to the name given in the DOCTYPE token;
+                 * the publicId attribute set to the public identifier given in
+                 * the DOCTYPE token, or the empty string if the public
+                 * identifier was not set; the systemId attribute set to the
+                 * system identifier given in the DOCTYPE token, or the empty
+                 * string if the system identifier was not set; and the other
+                 * attributes specific to DocumentType objects set to null and
+                 * empty lists as appropriate. Associate the DocumentType node
+                 * with the Document object so that it is returned as the value
+                 * of the doctype attribute of the Document object.
+                 */
+                if (reportingDoctype ) {
+                appendDoctypeToDocument(name, publicIdentifier == null ? ""
+                        : publicIdentifier, systemIdentifier == null ? ""
+                        : systemIdentifier);
+                }
+                /*
+                 * Then, if the DOCTYPE token matches one of the conditions in
+                 * the following list, then set the document to quirks mode:
+                 * 
+                 * Otherwise, if the DOCTYPE token matches one of the conditions
+                 * in the following list, then set the document to limited
+                 * quirks mode: + The public identifier is set to: "-//W3C//DTD
+                 * XHTML 1.0 Frameset//EN" + The public identifier is set to:
+                 * "-//W3C//DTD XHTML 1.0 Transitional//EN" + The system
+                 * identifier is not missing and the public identifier is set
+                 * to: "-//W3C//DTD HTML 4.01 Frameset//EN" + The system
+                 * identifier is not missing and the public identifier is set
+                 * to: "-//W3C//DTD HTML 4.01 Transitional//EN"
+                 * 
+                 * The name, system identifier, and public identifier strings
+                 * must be compared to the values given in the lists above in a
+                 * case-insensitive manner.
+                 */
+                String publicIdentifierLC = toAsciiLowerCase(publicIdentifier);
+                String systemIdentifierLC = toAsciiLowerCase(systemIdentifier);
+                switch (doctypeExpectation) {
+                    case HTML:
+                        if (isQuirky(name, publicIdentifierLC,
+                                systemIdentifierLC, correct)) {
+                            err("Quirky doctype.");
+                            documentModeInternal(DocumentMode.QUIRKS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        } else if (isAlmostStandards(publicIdentifierLC,
+                                systemIdentifierLC)) {
+                            err("Almost standards mode doctype.");
+                            documentModeInternal(DocumentMode.ALMOST_STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        } else {
+                            if (!(publicIdentifier == null && systemIdentifier == null)) {
+                                err("Legacy doctype.");
+                            }
+                            documentModeInternal(DocumentMode.STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        }
+                        break;
+                    case HTML401_STRICT:
+                        tokenizer.turnOnAdditionalHtml4Errors();
+                        if (isQuirky(name, publicIdentifierLC,
+                                systemIdentifierLC, correct)) {
+                            err("Quirky doctype.");
+                            documentModeInternal(DocumentMode.QUIRKS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        } else if (isAlmostStandards(publicIdentifierLC,
+                                systemIdentifierLC)) {
+                            err("Almost standards mode doctype.");
+                            documentModeInternal(DocumentMode.ALMOST_STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        } else {
+                            if ("-//W3C//DTD HTML 4.01//EN".equals(publicIdentifier)) {
+                                if (!"http://www.w3.org/TR/html4/strict.dtd".equals(systemIdentifier)) {
+                                    warn("The doctype did not contain the system identifier prescribed by the HTML 4.01 specification.");
+                                }
+                            } else {
+                                err("The doctype was not the HTML 4.01 Strict doctype.");
+                            }
+                            documentModeInternal(DocumentMode.STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        }
+                        break;
+                    case HTML401_TRANSITIONAL:
+                        tokenizer.turnOnAdditionalHtml4Errors();
+                        if (isQuirky(name, publicIdentifierLC,
+                                systemIdentifierLC, correct)) {
+                            err("Quirky doctype.");
+                            documentModeInternal(DocumentMode.QUIRKS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        } else if (isAlmostStandards(publicIdentifierLC,
+                                systemIdentifierLC)) {
+                            if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(publicIdentifier)
+                                    && systemIdentifier != null) {
+                                if (!"http://www.w3.org/TR/html4/loose.dtd".equals(systemIdentifier)) {
+                                    warn("The doctype did not contain the system identifier prescribed by the HTML 4.01 specification.");
+                                }
+                            } else {
+                                err("The doctype was not a non-quirky HTML 4.01 Transitional doctype.");
+                            }
+                            documentModeInternal(DocumentMode.ALMOST_STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        } else {
+                            err("The doctype was not the HTML 4.01 Transitional doctype.");
+                            documentModeInternal(DocumentMode.STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, true);
+                        }
+                        break;
+                    case AUTO:
+                        if (isQuirky(name, publicIdentifierLC,
+                                systemIdentifierLC, correct)) {
+                            err("Quirky doctype.");
+                            documentModeInternal(DocumentMode.QUIRKS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        } else if (isAlmostStandards(publicIdentifierLC,
+                                systemIdentifierLC)) {
+                            boolean html4 = "-//W3C//DTD HTML 4.01 Transitional//EN".equals(publicIdentifier);
+                            if (html4) {
+                                tokenizer.turnOnAdditionalHtml4Errors();
+                                if (!"http://www.w3.org/TR/html4/loose.dtd".equals(systemIdentifier)) {
+                                    warn("The doctype did not contain the system identifier prescribed by the HTML 4.01 specification.");
+                                }
+                            } else {
+                                err("Almost standards mode doctype.");
+                            }
+                            documentModeInternal(DocumentMode.ALMOST_STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, html4);
+                        } else {
+                            boolean html4 = "-//W3C//DTD HTML 4.01//EN".equals(publicIdentifier);
+                            if (html4) {
+                                tokenizer.turnOnAdditionalHtml4Errors();
+                                if (!"http://www.w3.org/TR/html4/strict.dtd".equals(systemIdentifier)) {
+                                    warn("The doctype did not contain the system identifier prescribed by the HTML 4.01 specification.");
+                                }
+                            } else {
+                                if (!(publicIdentifier == null && systemIdentifier == null)) {
+                                    err("Legacy doctype.");
+                                }
+                            }
+                            documentModeInternal(DocumentMode.STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, html4);
+                        }
+                        break;
+                    case NO_DOCTYPE_ERRORS:
+                        if (isQuirky(name, publicIdentifierLC,
+                                systemIdentifierLC, correct)) {
+                            documentModeInternal(DocumentMode.QUIRKS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        } else if (isAlmostStandards(publicIdentifierLC,
+                                systemIdentifierLC)) {
+                            documentModeInternal(DocumentMode.ALMOST_STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        } else {
+                            documentModeInternal(DocumentMode.STANDARDS_MODE,
+                                    publicIdentifier, systemIdentifier, false);
+                        }
+                        break;
+                }
+
+                /*
+                 * 
+                 * Then, switch to the root element phase of the tree
+                 * construction stage.
+                 * 
+                 * 
+                 */
+                phase = Phase.ROOT_ELEMENT;
+                return;
+            default:
+                /*
+                 * A DOCTYPE token Parse error.
+                 */
+                err("Stray doctype.");
+                /*
+                 * Ignore the token.
+                 */
+                return;
+        }
+    }
+
+    public final void comment(char[] buf, int length) throws SAXException {
+        needToDropLF = false;
+        if (wantingComments) {
+            switch (phase) {
+                case INITIAL:
+                case ROOT_ELEMENT:
+                case TRAILING_END:
+                    /*
+                     * A comment token Append a Comment node to the Document
+                     * object with the data attribute set to the data given in
+                     * the comment token.
+                     */
+                    appendCommentToDocument(buf, 0, length);
+                    return;
+                case AFTER_BODY:
+                    /*
+                     * * A comment token Append a Comment node to the first
+                     * element in the stack of open elements (the html element),
+                     * with the data attribute set to the data given in the
+                     * comment token.
+                     * 
+                     */
+                    flushCharacters();
+                    appendComment(stack[0].node, buf, 0, length);
+                    return;
+                default:
+                    /*
+                     * * A comment token Append a Comment node to the current
+                     * node with the data attribute set to the data given in the
+                     * comment token.
+                     * 
+                     */
+                    flushCharacters();
+                    appendComment(stack[currentPtr].node, buf, 0, length);
+                    return;
+            }
+        }
+    }
+
+    /**
+     * @see nu.validator.htmlparser.impl.TokenHandler#characters(char[], int, int)
+     */
+    public final void characters(char[] buf, int start, int length)
+            throws SAXException {
+        if (needToDropLF) {
+            if (buf[start] == '\n') {
+                start++;
+                length--;
+                if (length == 0) {
+                    return;
+                }
+            }
+            needToDropLF = false;
+        } else if (cdataOrRcdataTimesToPop > 0) {
+            accumulateCharacters(buf, start, length);
+            return;
+        }
+
+        // optimize the most common case
+        if (phase == Phase.IN_BODY || phase == Phase.IN_CELL
+                || phase == Phase.IN_CAPTION) {
+            reconstructTheActiveFormattingElements();
+            accumulateCharacters(buf, start, length);
+            return;
+        }
+
+        int end = start + length;
+        loop: for (int i = start; i < end; i++) {
+            switch (buf[i]) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * A character token that is one of one of U+0009 CHARACTER
+                     * TABULATION, U+000A LINE FEED (LF), U+000B LINE
+                     * TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE
+                     */
+                    switch (phase) {
+                        case INITIAL:
+                        case ROOT_ELEMENT:
+                            /*
+                             * Ignore the token.
+                             */
+                            start = i + 1;
+                            continue;
+                        case BEFORE_HEAD:
+                        case IN_HEAD:
+                        case IN_HEAD_NOSCRIPT:
+                        case AFTER_HEAD:
+                        case IN_TABLE:
+                        case IN_COLUMN_GROUP:
+                        case IN_TABLE_BODY:
+                        case IN_ROW:
+                        case IN_FRAMESET:
+                        case AFTER_FRAMESET:
+                            /*
+                             * Append the character to the current node.
+                             */
+                            continue;
+                        case IN_BODY:
+                        case IN_CELL:
+                        case IN_CAPTION:
+                            // XXX is this dead code?
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+
+                            /*
+                             * Reconstruct the active formatting elements, if
+                             * any.
+                             */
+                            reconstructTheActiveFormattingElements();
+                            /* Append the token's character to the current node. */
+                            break loop;
+                        case IN_SELECT:
+                            break loop;
+                        case AFTER_BODY:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Reconstruct the active formatting elements, if
+                             * any.
+                             */
+                            // XXX bug?
+                            reconstructTheActiveFormattingElements();
+                            /* Append the token's character to the current node. */
+                            continue;
+                        case TRAILING_END:
+                            if (conformingAndStreaming) {
+                                return;
+                            }
+                            if (previousPhaseBeforeTrailingEnd == Phase.AFTER_FRAMESET) {
+                                continue;
+                            } else {
+                                if (start < i) {
+                                    accumulateCharacters(buf, start, i
+                                            - start);
+                                    start = i;
+                                }
+                                /*
+                                 * Reconstruct the active formatting elements,
+                                 * if any.
+                                 */
+                                // XXX bug?
+                                reconstructTheActiveFormattingElements();
+                                /*
+                                 * Append the token's character to the current
+                                 * node.
+                                 */
+                                continue;
+                            }
+                    }
+                default:
+                    /*
+                     * A character token that is not one of one of U+0009
+                     * CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B LINE
+                     * TABULATION, U+000C FORM FEED (FF), or U+0020 SPACE
+                     */
+                    switch (phase) {
+                        case INITIAL:
+                            /*
+                             * Parse error.
+                             */
+                            if (doctypeExpectation != DoctypeExpectation.NO_DOCTYPE_ERRORS) {
+                                err("Non-space characters found without seeing a doctype first.");
+                            }
+                            /*
+                             * 
+                             * Set the document to quirks mode.
+                             */
+                            documentModeInternal(DocumentMode.QUIRKS_MODE, null, null,
+                                    false);
+                            /*
+                             * Then, switch to the root element phase of the
+                             * tree construction stage
+                             */
+                            phase = Phase.ROOT_ELEMENT;
+                            /*
+                             * and reprocess the current token.
+                             * 
+                             * 
+                             */
+                            i--;
+                            continue;
+                        case ROOT_ELEMENT:
+                            /*
+                             * Create an HTMLElement node with the tag name
+                             * html, in the HTML namespace. Append it to the
+                             * Document object.
+                             */
+                            appendHtmlElementToDocumentAndPush();
+                            /* Switch to the main phase */
+                            phase = Phase.BEFORE_HEAD;
+                            /*
+                             * reprocess the current token.
+                             * 
+                             */
+                            i--;
+                            continue;
+                        case BEFORE_HEAD:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * /*Act as if a start tag token with the tag name
+                             * "head" and no attributes had been seen,
+                             */
+                            appendToCurrentNodeAndPushHeadElement(EmptyAttributes.EMPTY_ATTRIBUTES);
+                            phase = Phase.IN_HEAD;
+                            /*
+                             * then reprocess the current token.
+                             * 
+                             * This will result in an empty head element being
+                             * generated, with the current token being
+                             * reprocessed in the "after head" insertion mode.
+                             */
+                            i--;
+                            continue;
+                        case IN_HEAD:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Act as if an end tag token with the tag name
+                             * "head" had been seen,
+                             */
+                            pop();
+                            phase = Phase.AFTER_HEAD;
+                            /*
+                             * and reprocess the current token.
+                             */
+                            i--;
+                            continue;
+                        case IN_HEAD_NOSCRIPT:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Parse error. Act as if an end tag with the tag
+                             * name "noscript" had been seen
+                             */
+                            err("Non-space character inside \u201Cnoscript\u201D inside \u201Chead\u201D.");
+                            pop();
+                            phase = Phase.IN_HEAD;
+                            /*
+                             * and reprocess the current token.
+                             */
+                            i--;
+                            continue;
+                        case AFTER_HEAD:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Act as if a start tag token with the tag name
+                             * "body" and no attributes had been seen,
+                             */
+                            appendToCurrentNodeAndPushBodyElement();
+                            phase = Phase.IN_BODY;
+                            /*
+                             * and then reprocess the current token.
+                             */
+                            i--;
+                            continue;
+                        case IN_BODY:
+                        case IN_CELL:
+                        case IN_CAPTION:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Reconstruct the active formatting elements, if
+                             * any.
+                             */
+                            reconstructTheActiveFormattingElements();
+                            /* Append the token's character to the current node. */
+                            break loop;
+                        case IN_TABLE:
+                        case IN_TABLE_BODY:
+                        case IN_ROW:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                            }
+                            reconstructTheActiveFormattingElements();
+                            appendCharMayFoster(buf, i);
+                            start = i + 1;
+                            continue;
+                        case IN_COLUMN_GROUP:
+                            /*
+                             * Act as if an end tag with the tag name "colgroup"
+                             * had been seen, and then, if that token wasn't
+                             * ignored, reprocess the current token.
+                             */
+                            if (currentPtr == 0) {
+                                err("Non-space in \u201Ccolgroup\u201D when parsing fragment.");
+                                continue;
+                            }
+                            pop();
+                            phase = Phase.IN_TABLE;
+                            i--;
+                            continue;
+                        case IN_SELECT:
+                            break loop;
+                        case AFTER_BODY:
+                            err("Non-space character after body.");
+                            if (conformingAndStreaming) {
+                                fatal();
+                            }
+                            phase = Phase.IN_BODY;
+                            i--;
+                            continue;
+                        case IN_FRAMESET:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Parse error.
+                             */
+                            err("Non-space in \u201Cframeset\u201D.");
+                            /*
+                             * Ignore the token.
+                             */
+                            start = i + 1;
+                            continue;
+                        case AFTER_FRAMESET:
+                            if (start < i) {
+                                accumulateCharacters(buf, start, i
+                                        - start);
+                                start = i;
+                            }
+                            /*
+                             * Parse error.
+                             */
+                            err("Non-space after \u201Cframeset\u201D.");
+                            /*
+                             * Ignore the token.
+                             */
+                            start = i + 1;
+                            continue;
+                        case TRAILING_END:
+                            /*
+                             * Parse error.
+                             */
+                            err("Non-space character in page trailer.");
+                            if (conformingAndStreaming) {
+                                fatal();
+                            }
+                            /*
+                             * Switch back to the main phase and reprocess the
+                             * token.
+                             */
+                            phase = previousPhaseBeforeTrailingEnd;
+                            i--;
+                            continue;
+                    }
+            }
+        }
+        if (start < end) {
+            accumulateCharacters(buf, start, end - start);
+        }
+    }
+
+    public final void eof() throws SAXException {
+        try {
+            flushCharacters();
+            eofloop: for (;;) {
+                switch (phase) {
+                    case INITIAL:
+                        /*
+                         * Parse error.
+                         */
+                        if (doctypeExpectation != DoctypeExpectation.NO_DOCTYPE_ERRORS) {
+                            err("End of file seen without seeing a doctype first.");
+                        }
+                        /*
+                         * 
+                         * Set the document to quirks mode.
+                         */
+                        documentModeInternal(DocumentMode.QUIRKS_MODE, null, null,
+                                false);
+                        /*
+                         * Then, switch to the root element phase of the tree
+                         * construction stage
+                         */
+                        phase = Phase.ROOT_ELEMENT;
+                        /*
+                         * and reprocess the current token.
+                         */
+                        continue;
+                    case ROOT_ELEMENT:
+                        /*
+                         * Create an HTMLElement node with the tag name html, in
+                         * the HTML namespace. Append it to the Document object.
+                         */
+                        appendHtmlElementToDocumentAndPush();
+                        /* Switch to the main phase */
+                        phase = Phase.BEFORE_HEAD;
+                        /*
+                         * reprocess the current token.
+                         */
+                        continue;
+                    case BEFORE_HEAD:
+                        appendToCurrentNodeAndPushHeadElement(EmptyAttributes.EMPTY_ATTRIBUTES);
+                        phase = Phase.IN_HEAD;
+                        continue;
+                    case IN_HEAD:
+                        if (currentPtr > 1) {
+                            err("End of file seen and there were open elements.");
+                        }
+                        while (currentPtr > 0) {
+                            pop();
+                        }
+                        phase = Phase.AFTER_HEAD;
+                        continue;
+                    case IN_HEAD_NOSCRIPT:
+                        err("End of file seen and there were open elements.");
+                        while (currentPtr > 1) {
+                            pop();
+                        }
+                        phase = Phase.IN_HEAD;
+                        continue;
+                    case AFTER_HEAD:
+                        appendToCurrentNodeAndPushBodyElement();
+                        phase = Phase.IN_BODY;
+                        continue;
+                    case IN_BODY:
+                    case IN_TABLE:
+                    case IN_CAPTION:
+                    case IN_COLUMN_GROUP:
+                    case IN_TABLE_BODY:
+                    case IN_ROW:
+                    case IN_CELL:
+                    case IN_SELECT:
+                        /*
+                         * Generate implied end tags.
+                         */
+                        generateImpliedEndTags();
+                        /*
+                         * If there are more than two nodes on the stack of open
+                         * elements,
+                         */
+                        if (currentPtr > 1) {
+                            err("End of file seen and there were open elements.");
+                        } else if (currentPtr == 1 && stack[1].name != "body") {
+                            /*
+                             * or if there are two nodes but the second node is
+                             * not a body node, this is a parse error.
+                             */
+                            err("End of file seen and there were open elements.");
+                        }
+                        if (context != null) {
+                            if (currentPtr > 0 && stack[1].name != "body") {
+                                /*
+                                 * Otherwise, if the parser was originally
+                                 * created as part of the HTML fragment parsing
+                                 * algorithm, and there's more than one element
+                                 * in the stack of open elements, and the second
+                                 * node on the stack of open elements is not a
+                                 * body node, then this is a parse error.
+                                 * (fragment case)
+                                 */
+                                err("End of file seen and there were open elements.");
+                            }
+                        }
+
+                        /* Stop parsing. */
+                        if (context == null) {
+                            bodyClosed(stack[1].node);
+                        }
+                        phase = Phase.AFTER_BODY;
+                        continue;
+                    /*
+                     * This fails because it doesn't imply HEAD and BODY tags.
+                     * We should probably expand out the insertion modes and
+                     * merge them with phases and then put the three things here
+                     * into each insertion mode instead of trying to factor them
+                     * out so carefully.
+                     * 
+                     */
+                    case IN_FRAMESET:
+                        err("End of file seen and there were open elements.");
+                        break eofloop;                        
+                    case AFTER_BODY:
+                    case AFTER_FRAMESET:
+                        if (context == null) {
+                            htmlClosed(stack[0].node);
+                        }
+                    case TRAILING_END:
+                        break eofloop;                        
+                }
+            }
+        } finally {
+            // XXX close elts for SAX
+            /* Stop parsing. */
+            stack = null;
+            listOfActiveFormattingElements = null;
+            end();
+        }
+    }
+
+    public final void startTag(String name, Attributes attributes)
+            throws SAXException {
+        needToDropLF = false;
+        for (;;) {
+            switch (phase) {
+                case IN_TABLE_BODY:
+                    if ("tr" == name) {
+                        clearStackBackTo(findLastInTableScopeOrRootTbodyTheadTfoot());
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_ROW;
+                        return;
+                    } else if ("td" == name || "th" == name) {
+                        err("\u201C" + name + "\u201D start tag in table body.");
+                        clearStackBackTo(findLastInTableScopeOrRootTbodyTheadTfoot());
+                        appendToCurrentNodeAndPushElement("tr",
+                                EmptyAttributes.EMPTY_ATTRIBUTES);
+                        phase = Phase.IN_ROW;
+                        continue;
+                    } else if ("caption" == name || "col" == name
+                            || "colgroup" == name || "tbody" == name
+                            || "tfoot" == name || "thead" == name) {
+                        int eltPos = findLastInTableScopeOrRootTbodyTheadTfoot();
+                        if (eltPos == 0) {
+                            err("Stray \u201C" + name + "\u201D start tag.");
+                            return;
+                        } else {
+                            clearStackBackTo(eltPos);
+                            pop();
+                            phase = Phase.IN_TABLE;
+                            continue;
+                        }
+                    } else {
+                        // fall through to IN_TABLE
+                    }
+                case IN_ROW:
+                    if ("td" == name || "th" == name) {
+                        clearStackBackTo(findLastOrRoot("tr"));
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_CELL;
+                        insertMarker();
+                        return;
+                    } else if ("caption" == name || "col" == name
+                            || "colgroup" == name || "tbody" == name
+                            || "tfoot" == name || "thead" == name
+                            || "tr" == name) {
+                        int eltPos = findLastOrRoot("tr");
+                        if (eltPos == 0) {
+                            assert context != null;
+                            err("No table row to close.");
+                            return;
+                        }
+                        clearStackBackTo(eltPos);
+                        pop();
+                        phase = Phase.IN_TABLE_BODY;
+                        continue;
+                    } else {
+                        // fall through to IN_TABLE
+                    }
+                case IN_TABLE:
+                    if ("caption" == name) {
+                        clearStackBackTo(findLastOrRoot("table"));
+                        insertMarker();
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_CAPTION;
+                        return;
+                    } else if ("colgroup" == name) {
+                        clearStackBackTo(findLastOrRoot("table"));
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_COLUMN_GROUP;
+                        return;
+                    } else if ("col" == name) {
+                        clearStackBackTo(findLastOrRoot("table"));
+                        appendToCurrentNodeAndPushElement("colgroup",
+                                EmptyAttributes.EMPTY_ATTRIBUTES);
+                        phase = Phase.IN_COLUMN_GROUP;
+                        continue;
+                    } else if ("tbody" == name || "tfoot" == name
+                            || "thead" == name) {
+                        clearStackBackTo(findLastOrRoot("table"));
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_TABLE_BODY;
+                        return;
+                    } else if ("td" == name || "tr" == name || "th" == name) {
+                        clearStackBackTo(findLastOrRoot("table"));
+                        appendToCurrentNodeAndPushElement("tbody",
+                                EmptyAttributes.EMPTY_ATTRIBUTES);
+                        phase = Phase.IN_TABLE_BODY;
+                        continue;
+                    } else if ("table" == name) {
+                        err("Start tag for \u201Ctable\u201D seen but the previous \u201Ctable\u201D is still open.");
+                        int eltPos = findLastInTableScope(name);
+                        if (eltPos == NOT_FOUND_ON_STACK) {
+                            assert context != null;
+                            return;
+                        }
+                        generateImpliedEndTags();
+                        // XXX is the next if dead code?
+                        if (!isCurrent("table")) {
+                            err("Unclosed elements on stack.");
+                        }
+                        while (currentPtr >= eltPos) {
+                            pop();
+                        }
+                        resetTheInsertionMode();
+                        continue;
+                    } else {
+                        err("Start tag \u201C" + name
+                                + "\u201D seen in \u201Ctable\u201D.");
+                        // fall through to IN_BODY
+                    }
+                case IN_CAPTION:
+                    if ("caption" == name || "col" == name
+                            || "colgroup" == name || "tbody" == name
+                            || "td" == name || "tfoot" == name || "th" == name
+                            || "thead" == name || "tr" == name) {
+                        err("Stray \u201C" + name
+                                + "\u201D start tag in \u201Ccaption\u201D.");
+                        int eltPos = findLastInTableScope("caption");
+                        if (eltPos == NOT_FOUND_ON_STACK) {
+                            return;
+                        }
+                        generateImpliedEndTags();
+                        if (currentPtr != eltPos) {
+                            err("Unclosed elements on stack.");
+                        }
+                        while (currentPtr >= eltPos) {
+                            pop();
+                        }
+                        clearTheListOfActiveFormattingElementsUpToTheLastMarker();
+                        phase = Phase.IN_TABLE;
+                        continue;
+                    } else {
+                        // fall through to IN_BODY
+                    }
+                case IN_CELL:
+                    if ("caption" == name || "col" == name
+                            || "colgroup" == name || "tbody" == name
+                            || "td" == name || "tfoot" == name || "th" == name
+                            || "thead" == name || "tr" == name) {
+                        int eltPos = findLastInTableScopeTdTh();
+                        if (eltPos == NOT_FOUND_ON_STACK) {
+                            err("No cell to close.");
+                            return;
+                        } else {
+                            closeTheCell(eltPos);
+                            continue;
+                        }
+                    } else {
+                        // fall through to IN_BODY
+                    }
+                case IN_BODY:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("base" == name || "link" == name || "meta" == name
+                            || "style" == name || "script" == name) {
+                        // Fall through to IN_HEAD
+                    } else if ("title" == name) {
+                        err("\u201Ctitle\u201D element found inside \u201Cbody\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        cdataOrRcdataTimesToPop = nonConformingAndStreaming ? 1
+                                : 2; // pops head
+                        tokenizer.setContentModelFlag(ContentModelFlag.RCDATA,
+                                name);
+                        return;
+                    } else if ("body" == name) {
+                        err("\u201Cbody\u201D start tag found but the \u201Cbody\u201D element is already open.");
+                        addAttributesToBody(attributes);
+                        return;
+                    } else if ("p" == name || "div" == name || "h1" == name
+                            || "h2" == name || "h3" == name || "h4" == name
+                            || "h5" == name || "h6" == name
+                            || "blockquote" == name || "ol" == name
+                            || "ul" == name || "dl" == name
+                            || "fieldset" == name || "address" == name
+                            || "menu" == name || "center" == name
+                            || "dir" == name || "listing" == name) {
+                        implicitlyCloseP();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        return;
+                    } else if ("pre" == name) {
+                        implicitlyCloseP();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        needToDropLF = true;
+                        return;
+                    } else if ("form" == name) {
+                        if (formPointer != null) {
+                            err("Saw a \u201Cform\u201D start tag, but there was already an active \u201Cform\u201D element.");
+                            return;
+                        } else {
+                            implicitlyCloseP();
+                            appendToCurrentNodeAndPushFormElementMayFoster(attributes);
+                            return;
+                        }
+                    } else if ("li" == name) {
+                        implicitlyCloseP();
+                        int eltPos = findLiToPop();
+                        if (eltPos < currentPtr) {
+                            err("A \u201Cli\u201D start tag was seen but the previous \u201Cli\u201D element had open children.");
+                        }
+                        while (currentPtr >= eltPos) {
+                            pop();
+                        }
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        return;
+                    } else if ("dd" == name || "dt" == name) {
+                        implicitlyCloseP();
+                        int eltPos = findDdOrDtToPop();
+                        if (eltPos < currentPtr) {
+                            err("A definition list item start tag was seen but the previous definition list item element had open children.");
+                        }
+                        while (currentPtr >= eltPos) {
+                            pop();
+                        }
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        return;
+                    } else if ("plaintext" == name) {
+                        implicitlyCloseP();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        tokenizer.setContentModelFlag(
+                                ContentModelFlag.PLAINTEXT, name);
+                        return;
+                    } else if ("a" == name) {
+                        int activeAPos = findInListOfActiveFormattingElementsContainsBetweenEndAndLastMarker("a");
+                        if (activeAPos != -1) {
+                            err("An \u201Ca\u201D start tag seen with already an active \u201Ca\u201D element.");
+                            StackNode<T> activeA = listOfActiveFormattingElements[activeAPos];
+                            adoptionAgencyEndTag("a");
+                            removeFromStack(activeA);
+                            activeAPos = findInListOfActiveFormattingElements(activeA);
+                            if (activeAPos != -1) {
+                                removeFromListOfActiveFormattingElements(activeAPos);
+                            }
+                        }
+                        reconstructTheActiveFormattingElements();
+                        appendToCurrentNodeAndPushFormattingElementMayFoster(name,
+                                attributes);
+                        return;
+                    } else if ("i" == name || "b" == name || "em" == name
+                            || "strong" == name || "font" == name
+                            || "big" == name || "s" == name || "small" == name
+                            || "strike" == name || "tt" == name || "u" == name) {
+                        reconstructTheActiveFormattingElements();
+                        appendToCurrentNodeAndPushFormattingElementMayFoster(name,
+                                attributes);
+                        return;
+                    } else if ("nobr" == name) {
+                        reconstructTheActiveFormattingElements();
+                        if (NOT_FOUND_ON_STACK != findLastInScope("nobr")) {
+                            err("\u201Cnobr\u201D start tag seen when there was an open \u201Cnobr\u201D element in scope.");
+                            adoptionAgencyEndTag("nobr");
+                        }
+                        appendToCurrentNodeAndPushFormattingElementMayFoster(name,
+                                attributes);
+                        return;
+                    } else if ("button" == name) {
+                        int eltPos = findLastInScope(name);
+                        if (eltPos != NOT_FOUND_ON_STACK) {
+                            err("\u201Cbutton\u201D start tag seen when there was an open \u201Cbutton\u201D element in scope.");
+                            generateImpliedEndTags();
+                            if (!isCurrent("button")) {
+                                err("There was an open \u201Cbutton\u201D element in scope with unclosed children.");
+                            }
+                            while (currentPtr >= eltPos) {
+                                pop();
+                            }
+                            clearTheListOfActiveFormattingElementsUpToTheLastMarker();
+                            continue;
+                        } else {
+                            reconstructTheActiveFormattingElements();
+                            // XXX form
+                            appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                            insertMarker();
+                            return;
+                        }
+                    } else if ("object" == name || "marquee" == name) {
+                        reconstructTheActiveFormattingElements();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        insertMarker();
+                        return;
+                    } else if ("xmp" == name) {
+                        reconstructTheActiveFormattingElements();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("table" == name) {
+                        implicitlyCloseP();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        phase = Phase.IN_TABLE;
+                        return;
+                    } else if ("br" == name || "img" == name || "embed" == name
+                            || "param" == name || "area" == name
+                            || "basefont" == name || "bgsound" == name
+                            || "spacer" == name || "wbr" == name) {
+                        reconstructTheActiveFormattingElements();
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else if ("hr" == name) {
+                        implicitlyCloseP();
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else if ("image" == name) {
+                        err("Saw a start tag \u201Cimage\u201D.");
+                        name = "img";
+                        continue;
+                    } else if ("input" == name) {
+                        reconstructTheActiveFormattingElements();
+                        appendVoidElementToCurrentMayFoster(name, attributes, formPointer);
+                        return;
+                    } else if ("isindex" == name) {
+                        err("\u201Cisindex\u201D seen.");
+                        if (formPointer != null) {
+                            return;
+                        }
+                        implicitlyCloseP();
+                        AttributesImpl formAttrs = tokenizer.newAttributes();
+                        int actionIndex = attributes.getIndex("action");
+                        if (actionIndex > -1) {
+                            formAttrs.addAttribute("action",
+                                    attributes.getValue(actionIndex));
+                        }
+                        appendToCurrentNodeAndPushFormElementMayFoster(formAttrs);
+                        appendVoidElementToCurrentMayFoster("hr", EmptyAttributes.EMPTY_ATTRIBUTES);
+                        appendToCurrentNodeAndPushElementMayFoster("p",
+                                EmptyAttributes.EMPTY_ATTRIBUTES);
+                        appendToCurrentNodeAndPushElementMayFoster("label",
+                                EmptyAttributes.EMPTY_ATTRIBUTES);
+                        int promptIndex = attributes.getIndex("prompt");
+                        if (promptIndex > -1) {
+                            char[] prompt = attributes.getValue(promptIndex).toCharArray();
+                            appendCharacters(stack[currentPtr].node, prompt,
+                                    0, prompt.length);
+                        } else {
+                            // XXX localization
+                            appendCharacters(stack[currentPtr].node, ISINDEX_PROMPT,
+                                    0, ISINDEX_PROMPT.length);
+                        }
+                        AttributesImpl inputAttributes = tokenizer.newAttributes();
+                        inputAttributes.addAttribute("name", "isindex");
+                        for (int i = 0; i < attributes.getLength(); i++) {
+                            String attributeQName = attributes.getQName(i);
+                            if (!("name".equals(attributeQName)
+                                    || "action".equals(attributeQName) || "prompt".equals(attributeQName))) {
+                                inputAttributes.addAttribute(attributeQName,
+                                        attributes.getValue(i));
+                            }
+                        }
+                        appendVoidElementToCurrentMayFoster("input", inputAttributes, formPointer);
+                        // XXX localization
+                        pop(); // label
+                        pop(); // p
+                        appendVoidElementToCurrentMayFoster("hr", EmptyAttributes.EMPTY_ATTRIBUTES);
+                        pop(); // form
+                        return;
+                    } else if ("textarea" == name) {
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes, formPointer);
+                        tokenizer.setContentModelFlag(ContentModelFlag.RCDATA,
+                                name);
+                        cdataOrRcdataTimesToPop = 1;
+                        needToDropLF = true;
+                        return;
+                    } else if ("iframe" == name || "noembed" == name
+                            || "noframes" == name
+                            || ("noscript" == name && scriptingEnabled)) {
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("select" == name) {
+                        reconstructTheActiveFormattingElements();
+                        // XXX form pointer
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        phase = Phase.IN_SELECT;
+                        return;
+                    } else if ("caption" == name || "col" == name
+                            || "colgroup" == name || "frame" == name
+                            || "frameset" == name || "head" == name
+                            || "option" == name || "optgroup" == name
+                            || "tbody" == name || "td" == name
+                            || "tfoot" == name || "th" == name
+                            || "thead" == name || "tr" == name) {
+                        err("Stray start tag \u201C" + name + "\u201D.");
+                        return;
+                    } else {
+                        reconstructTheActiveFormattingElements();
+                        appendToCurrentNodeAndPushElementMayFoster(name, attributes);
+                        return;
+                    }
+                case IN_HEAD:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("base" == name) {
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else if ("meta" == name || "link" == name) {
+                        // Fall through to IN_HEAD_NOSCRIPT
+                    } else if ("title" == name) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.RCDATA,
+                                name);
+                        return;
+                    } else if ("style" == name
+                            || ("noscript" == name && scriptingEnabled)) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("noscript" == name && !scriptingEnabled) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_HEAD_NOSCRIPT;
+                        return;
+                    } else if ("script" == name) {
+                        // XXX need to manage much more stuff here if supporting
+                        // document.write()
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("head" == name) {
+                        /* Parse error. */
+                        err("Start tag for \u201Chead\u201D seen when \u201Chead\u201D was already open.");
+                        /* Ignore the token. */
+                        return;
+                    } else {
+                        pop();
+                        phase = Phase.AFTER_HEAD;
+                        continue;
+                    }
+                case IN_HEAD_NOSCRIPT:
+                    // XXX did Hixie really mean to omit "base" here?
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("link" == name) {
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else if ("meta" == name) {
+                        // XXX do charset stuff
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else if ("style" == name) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("head" == name) {
+                        err("Start tag for \u201Chead\u201D seen when \u201Chead\u201D was already open.");
+                        return;
+                    } else if ("noscript" == name) {
+                        err("Start tag for \u201Cnoscript\u201D seen when \u201Cnoscript\u201D was already open.");
+                        return;
+                    } else {
+                        err("Bad start tag in \u201Cnoscript\u201D in \u201Chead\u201D.");
+                        pop();
+                        phase = Phase.IN_HEAD;
+                        continue;
+                    }
+                case IN_COLUMN_GROUP:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("col" == name) {
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else {
+                        if (currentPtr == 0) {
+                            assert context != null;
+                            err("Garbage in \u201Ccolgroup\u201D fragment.");
+                            return;
+                        }
+                        pop();
+                        phase = Phase.IN_TABLE;
+                        continue;
+                    }
+                case IN_SELECT:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("option" == name) {
+                        if (isCurrent("option")) {
+                            pop();
+                        }
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        return;
+                    } else if ("optgroup" == name) {
+                        if (isCurrent("option")) {
+                            pop();
+                        }
+                        if (isCurrent("optgroup")) {
+                            pop();
+                        }
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        return;
+                    } else if ("select" == name) {
+                        err("\u201Cselect\u201D start tag where end tag expected.");
+                        int eltPos = findLastInTableScope(name);
+                        if (eltPos == NOT_FOUND_ON_STACK) {
+                            assert context != null;
+                            err("No \u201Cselect\u201D in table scope.");
+                            return;
+                        } else {
+                            while (currentPtr >= eltPos) {
+                                pop();
+                            }
+                            resetTheInsertionMode();
+                            return;
+                        }
+                    } else {
+                        err("Stray \u201C" + name + "\u201D start tag.");
+                        return;
+                    }
+                case AFTER_BODY:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else {
+                        err("Stray \u201C" + name + "\u201D start tag.");
+                        if (conformingAndStreaming) {
+                            fatal();
+                        }
+                        phase = Phase.IN_BODY;
+                        continue;
+                    }
+                case IN_FRAMESET:
+                    if ("frameset" == name) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        return;
+                    } else if ("frame" == name) {
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        return;
+                    } else {
+                        // fall through to AFTER_FRAMESET
+                    }
+                case AFTER_FRAMESET:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("noframes" == name) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = 1;
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else {
+                        err("Stray \u201C" + name + "\u201D start tag.");
+                        return;
+                    }
+                case INITIAL:
+                    /*
+                     * Parse error.
+                     */
+                    if (doctypeExpectation != DoctypeExpectation.NO_DOCTYPE_ERRORS) {
+                        err("Start tag seen without seeing a doctype first.");
+                    }
+                    /*
+                     * 
+                     * Set the document to quirks mode.
+                     */
+                    documentModeInternal(DocumentMode.QUIRKS_MODE, null, null, false);
+                    /*
+                     * Then, switch to the root element phase of the tree
+                     * construction stage
+                     */
+                    phase = Phase.ROOT_ELEMENT;
+                    /*
+                     * and reprocess the current token.
+                     */
+                    continue;
+                case ROOT_ELEMENT:
+                    // optimize error check and streaming SAX by hoisting
+                    // "html" handling here.
+                    if ("html" == name) {
+                        if (attributes.getLength() == 0) {
+                            // This has the right magic side effect that it
+                            // makes attributes in SAX Tree mutable.
+                            appendHtmlElementToDocumentAndPush();
+                        } else {
+                            appendHtmlElementToDocumentAndPush(attributes);
+                        }
+                        phase = Phase.BEFORE_HEAD;
+                        return;
+                    } else {
+                        /*
+                         * Create an HTMLElement node with the tag name html, in
+                         * the HTML namespace. Append it to the Document object.
+                         */
+                        appendHtmlElementToDocumentAndPush();
+                        /* Switch to the main phase */
+                        phase = Phase.BEFORE_HEAD;
+                        /*
+                         * reprocess the current token.
+                         * 
+                         */
+                        continue;
+                    }
+                case BEFORE_HEAD:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("head" == name) {
+                        /*
+                         * A start tag whose tag name is "head"
+                         * 
+                         * Create an element for the token.
+                         * 
+                         * Set the head element pointer to this new element
+                         * node.
+                         * 
+                         * Append the new element to the current node and push
+                         * it onto the stack of open elements.
+                         */
+                        appendToCurrentNodeAndPushHeadElement(attributes);
+                        /*
+                         * 
+                         * Change the insertion mode to "in head".
+                         * 
+                         */
+                        phase = Phase.IN_HEAD;
+                        return;
+                    }
+
+                    /*
+                     * Any other start tag token
+                     */
+
+                    /*
+                     * Act as if a start tag token with the tag name "head" and
+                     * no attributes had been seen,
+                     */
+                    appendToCurrentNodeAndPushHeadElement(EmptyAttributes.EMPTY_ATTRIBUTES);
+                    phase = Phase.IN_HEAD;
+                    /*
+                     * then reprocess the current token.
+                     * 
+                     * This will result in an empty head element being
+                     * generated, with the current token being reprocessed in
+                     * the "after head" insertion mode.
+                     */
+                    continue;
+                case AFTER_HEAD:
+                    if ("html" == name) {
+                        err("Stray \u201Chtml\u201D start tag.");
+                        addAttributesToElement(stack[0].node, attributes);
+                        return;
+                    } else if ("body" == name) {
+                        if (attributes.getLength() == 0) {
+                            // This has the right magic side effect that it
+                            // makes attributes in SAX Tree mutable.
+                            appendToCurrentNodeAndPushBodyElement();
+                        } else {
+                            appendToCurrentNodeAndPushBodyElement(attributes);
+                        }
+                        phase = Phase.IN_BODY;
+                        return;
+                    } else if ("frameset" == name) {
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        phase = Phase.IN_FRAMESET;
+                        return;
+                    } else if ("base" == name) {
+                        err("\u201Cbase\u201D element outside \u201Chead\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        if (!nonConformingAndStreaming) {
+                            pop(); // head
+                        }
+                        return;
+                    } else if ("link" == name) {
+                        err("\u201Clink\u201D element outside \u201Chead\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        if (!nonConformingAndStreaming) {
+                            pop(); // head
+                        }
+                        return;
+                    } else if ("meta" == name) {
+                        err("\u201Cmeta\u201D element outside \u201Chead\u201D.");
+                        // XXX do chaset stuff
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendVoidElementToCurrentMayFoster(name, attributes);
+                        if (!nonConformingAndStreaming) {
+                            pop(); // head
+                        }
+                        return;
+                    } else if ("script" == name) {
+                        err("\u201Cscript\u201D element between \u201Chead\u201D and \u201Cbody\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = nonConformingAndStreaming ? 1
+                                : 2; // pops head
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("style" == name) {
+                        err("\u201Cstyle\u201D element between \u201Chead\u201D and \u201Cbody\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = nonConformingAndStreaming ? 1
+                                : 2; // pops head
+                        tokenizer.setContentModelFlag(ContentModelFlag.CDATA,
+                                name);
+                        return;
+                    } else if ("title" == name) {
+                        err("\u201Ctitle\u201D element outside \u201Chead\u201D.");
+                        if (!nonConformingAndStreaming) {
+                            pushHeadPointerOntoStack();
+                        }
+                        appendToCurrentNodeAndPushElement(name, attributes);
+                        cdataOrRcdataTimesToPop = nonConformingAndStreaming ? 1
+                                : 2; // pops head
+                        tokenizer.setContentModelFlag(ContentModelFlag.RCDATA,
+                                name);
+                        return;
+                    } else {
+                        appendToCurrentNodeAndPushBodyElement();
+                        phase = Phase.IN_BODY;
+                        continue;
+                    }
+                case TRAILING_END:
+                    err("Stray \u201C" + name + "\u201D start tag.");
+                    if (conformingAndStreaming) {
+                        fatal();
+                    }
+                    phase = previousPhaseBeforeTrailingEnd;
+                    continue;
+            }
+        }
+    }
+
+    public final void endTag(String name, Attributes attributes)
+            throws SAXException {
+        needToDropLF = false;
+        if (cdataOrRcdataTimesToPop > 0) {
+            while (cdataOrRcdataTimesToPop > 0) {
+                pop();

[... 1509 lines stripped ...]


Mime
View raw message