abdera-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jmsn...@apache.org
Subject svn commit: r587550 [3/6] - in /incubator/abdera/java/trunk/extensions/json/src/main: java/nu/ java/nu/validator/ java/nu/validator/htmlparser/ java/nu/validator/htmlparser/common/ java/nu/validator/htmlparser/impl/ java/nu/validator/htmlparser/sax/ ja...
Date Tue, 23 Oct 2007 16:28:58 GMT
Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Tokenizer.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Tokenizer.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Tokenizer.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Tokenizer.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,4079 @@
+/*
+ * Copyright (c) 2005, 2006, 2007 Henri Sivonen
+ * Copyright (c) 2007 Mozilla Foundation
+ * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla 
+ * Foundation, and Opera Software ASA.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * The comments following this one that use the same comment syntax as this 
+ * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
+ * amended as of June 23 2007.
+ * That document came with this statement:
+ * "© Copyright 2004-2007 Apple Computer, Inc., Mozilla Foundation, and 
+ * Opera Software ASA. You are granted a license to use, reproduce and 
+ * create derivative works of this document."
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+/**
+ * An implementatition of
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/section-tokenisation.html
+ * 
+ * This class implements the <code>Locator</code> interface. This is not an
+ * incidental implementation detail: Users of this class are encouraged to make
+ * use of the <code>Locator</code> nature.
+ * 
+ * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
+ * can be configured to treat these conditions as fatal or to coerce the infoset
+ * to something that XML 1.0 allows.
+ * 
+ * @version $Id: Tokenizer.java 153 2007-09-11 07:41:33Z hsivonen $
+ * @author hsivonen
+ */
+public final class Tokenizer implements Locator {
+
+    private static final Pattern NCNAME_PATTERN = Pattern.compile("(?:[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u0451-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\
 \u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A
 -\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1
 FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_)(?:[\\u0030-\\u0039]|[\\u0660-\\u0669]|[\\u06F0-\\u06F9]|[\\u0966-\\u096F]|[\\u09E6-\\u09EF]|[\\u0A66-\\u0A6F]|[\\u0AE6-\\u0AEF]|[\\u0B66-\\u0B6F]|[\\u0BE7-\\u0BEF]|[\\u0C66-\\u0C6F]|[\\u0CE6-\\u0CEF]|[\\u0D66-\\u0D6F]|[\\u0E50-\\u0E59]|[\\u0ED0-\\u0ED9]|[\\u0F20-\\u0F29]|[\\u0041-\\u005A]|[\\u0061-\\u007A]|[\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u00FF]|[\\u0100-\\u0131]|[\\u0134-\\u013E]|[\\u0141-\\u0148]|[\\u014A-\\u017E]|[\\u0180-\\u01C3]|[\\u01CD-\\u01F0]|[\\u01F4-\\u01F5]|[\\u01FA-\\u0217]|[\\u0250-\\u02A8]|[\\u02BB-\\u02C1]|\\u0386|[\\u0388-\\u038A]|\\u038C|[\\u038E-\\u03A1]|[\\u03A3-\\u03CE]|[\\u03D0-\\u03D6]|\\u03DA|\\u03DC|\\u03DE|\\u03E0|[\\u03E2-\\u03F3]|[\\u0401-\\u040C]|[\\u040E-\\u044F]|[\\u045
 1-\\u045C]|[\\u045E-\\u0481]|[\\u0490-\\u04C4]|[\\u04C7-\\u04C8]|[\\u04CB-\\u04CC]|[\\u04D0-\\u04EB]|[\\u04EE-\\u04F5]|[\\u04F8-\\u04F9]|[\\u0531-\\u0556]|\\u0559|[\\u0561-\\u0586]|[\\u05D0-\\u05EA]|[\\u05F0-\\u05F2]|[\\u0621-\\u063A]|[\\u0641-\\u064A]|[\\u0671-\\u06B7]|[\\u06BA-\\u06BE]|[\\u06C0-\\u06CE]|[\\u06D0-\\u06D3]|\\u06D5|[\\u06E5-\\u06E6]|[\\u0905-\\u0939]|\\u093D|[\\u0958-\\u0961]|[\\u0985-\\u098C]|[\\u098F-\\u0990]|[\\u0993-\\u09A8]|[\\u09AA-\\u09B0]|\\u09B2|[\\u09B6-\\u09B9]|[\\u09DC-\\u09DD]|[\\u09DF-\\u09E1]|[\\u09F0-\\u09F1]|[\\u0A05-\\u0A0A]|[\\u0A0F-\\u0A10]|[\\u0A13-\\u0A28]|[\\u0A2A-\\u0A30]|[\\u0A32-\\u0A33]|[\\u0A35-\\u0A36]|[\\u0A38-\\u0A39]|[\\u0A59-\\u0A5C]|\\u0A5E|[\\u0A72-\\u0A74]|[\\u0A85-\\u0A8B]|\\u0A8D|[\\u0A8F-\\u0A91]|[\\u0A93-\\u0AA8]|[\\u0AAA-\\u0AB0]|[\\u0AB2-\\u0AB3]|[\\u0AB5-\\u0AB9]|\\u0ABD|\\u0AE0|[\\u0B05-\\u0B0C]|[\\u0B0F-\\u0B10]|[\\u0B13-\\u0B28]|[\\u0B2A-\\u0B30]|[\\u0B32-\\u0B33]|[\\u0B36-\\u0B39]|\\u0B3D|[\\u0B5C-\\u0B5D]|[\\u0B
 5F-\\u0B61]|[\\u0B85-\\u0B8A]|[\\u0B8E-\\u0B90]|[\\u0B92-\\u0B95]|[\\u0B99-\\u0B9A]|\\u0B9C|[\\u0B9E-\\u0B9F]|[\\u0BA3-\\u0BA4]|[\\u0BA8-\\u0BAA]|[\\u0BAE-\\u0BB5]|[\\u0BB7-\\u0BB9]|[\\u0C05-\\u0C0C]|[\\u0C0E-\\u0C10]|[\\u0C12-\\u0C28]|[\\u0C2A-\\u0C33]|[\\u0C35-\\u0C39]|[\\u0C60-\\u0C61]|[\\u0C85-\\u0C8C]|[\\u0C8E-\\u0C90]|[\\u0C92-\\u0CA8]|[\\u0CAA-\\u0CB3]|[\\u0CB5-\\u0CB9]|\\u0CDE|[\\u0CE0-\\u0CE1]|[\\u0D05-\\u0D0C]|[\\u0D0E-\\u0D10]|[\\u0D12-\\u0D28]|[\\u0D2A-\\u0D39]|[\\u0D60-\\u0D61]|[\\u0E01-\\u0E2E]|\\u0E30|[\\u0E32-\\u0E33]|[\\u0E40-\\u0E45]|[\\u0E81-\\u0E82]|\\u0E84|[\\u0E87-\\u0E88]|\\u0E8A|\\u0E8D|[\\u0E94-\\u0E97]|[\\u0E99-\\u0E9F]|[\\u0EA1-\\u0EA3]|\\u0EA5|\\u0EA7|[\\u0EAA-\\u0EAB]|[\\u0EAD-\\u0EAE]|\\u0EB0|[\\u0EB2-\\u0EB3]|\\u0EBD|[\\u0EC0-\\u0EC4]|[\\u0F40-\\u0F47]|[\\u0F49-\\u0F69]|[\\u10A0-\\u10C5]|[\\u10D0-\\u10F6]|\\u1100|[\\u1102-\\u1103]|[\\u1105-\\u1107]|\\u1109|[\\u110B-\\u110C]|[\\u110E-\\u1112]|\\u113C|\\u113E|\\u1140|\\u114C|\\u114E|\\u1150|[\\u1
 154-\\u1155]|\\u1159|[\\u115F-\\u1161]|\\u1163|\\u1165|\\u1167|\\u1169|[\\u116D-\\u116E]|[\\u1172-\\u1173]|\\u1175|\\u119E|\\u11A8|\\u11AB|[\\u11AE-\\u11AF]|[\\u11B7-\\u11B8]|\\u11BA|[\\u11BC-\\u11C2]|\\u11EB|\\u11F0|\\u11F9|[\\u1E00-\\u1E9B]|[\\u1EA0-\\u1EF9]|[\\u1F00-\\u1F15]|[\\u1F18-\\u1F1D]|[\\u1F20-\\u1F45]|[\\u1F48-\\u1F4D]|[\\u1F50-\\u1F57]|\\u1F59|\\u1F5B|\\u1F5D|[\\u1F5F-\\u1F7D]|[\\u1F80-\\u1FB4]|[\\u1FB6-\\u1FBC]|\\u1FBE|[\\u1FC2-\\u1FC4]|[\\u1FC6-\\u1FCC]|[\\u1FD0-\\u1FD3]|[\\u1FD6-\\u1FDB]|[\\u1FE0-\\u1FEC]|[\\u1FF2-\\u1FF4]|[\\u1FF6-\\u1FFC]|\\u2126|[\\u212A-\\u212B]|\\u212E|[\\u2180-\\u2182]|[\\u3041-\\u3094]|[\\u30A1-\\u30FA]|[\\u3105-\\u312C]|[\\uAC00-\\uD7A3]|[\\u4E00-\\u9FA5]|\\u3007|[\\u3021-\\u3029]|_|\\.|-|[\\u0300-\\u0345]|[\\u0360-\\u0361]|[\\u0483-\\u0486]|[\\u0591-\\u05A1]|[\\u05A3-\\u05B9]|[\\u05BB-\\u05BD]|\\u05BF|[\\u05C1-\\u05C2]|\\u05C4|[\\u064B-\\u0652]|\\u0670|[\\u06D6-\\u06DC]|[\\u06DD-\\u06DF]|[\\u06E0-\\u06E4]|[\\u06E7-\\u06E8]|[\\u06EA-\
 \u06ED]|[\\u0901-\\u0903]|\\u093C|[\\u093E-\\u094C]|\\u094D|[\\u0951-\\u0954]|[\\u0962-\\u0963]|[\\u0981-\\u0983]|\\u09BC|\\u09BE|\\u09BF|[\\u09C0-\\u09C4]|[\\u09C7-\\u09C8]|[\\u09CB-\\u09CD]|\\u09D7|[\\u09E2-\\u09E3]|\\u0A02|\\u0A3C|\\u0A3E|\\u0A3F|[\\u0A40-\\u0A42]|[\\u0A47-\\u0A48]|[\\u0A4B-\\u0A4D]|[\\u0A70-\\u0A71]|[\\u0A81-\\u0A83]|\\u0ABC|[\\u0ABE-\\u0AC5]|[\\u0AC7-\\u0AC9]|[\\u0ACB-\\u0ACD]|[\\u0B01-\\u0B03]|\\u0B3C|[\\u0B3E-\\u0B43]|[\\u0B47-\\u0B48]|[\\u0B4B-\\u0B4D]|[\\u0B56-\\u0B57]|[\\u0B82-\\u0B83]|[\\u0BBE-\\u0BC2]|[\\u0BC6-\\u0BC8]|[\\u0BCA-\\u0BCD]|\\u0BD7|[\\u0C01-\\u0C03]|[\\u0C3E-\\u0C44]|[\\u0C46-\\u0C48]|[\\u0C4A-\\u0C4D]|[\\u0C55-\\u0C56]|[\\u0C82-\\u0C83]|[\\u0CBE-\\u0CC4]|[\\u0CC6-\\u0CC8]|[\\u0CCA-\\u0CCD]|[\\u0CD5-\\u0CD6]|[\\u0D02-\\u0D03]|[\\u0D3E-\\u0D43]|[\\u0D46-\\u0D48]|[\\u0D4A-\\u0D4D]|\\u0D57|\\u0E31|[\\u0E34-\\u0E3A]|[\\u0E47-\\u0E4E]|\\u0EB1|[\\u0EB4-\\u0EB9]|[\\u0EBB-\\u0EBC]|[\\u0EC8-\\u0ECD]|[\\u0F18-\\u0F19]|\\u0F35|\\u0F37|\\u0F39|\
 \u0F3E|\\u0F3F|[\\u0F71-\\u0F84]|[\\u0F86-\\u0F8B]|[\\u0F90-\\u0F95]|\\u0F97|[\\u0F99-\\u0FAD]|[\\u0FB1-\\u0FB7]|\\u0FB9|[\\u20D0-\\u20DC]|\\u20E1|[\\u302A-\\u302F]|\\u3099|\\u309A|\\u00B7|\\u02D0|\\u02D1|\\u0387|\\u0640|\\u0E46|\\u0EC6|\\u3005|[\\u3031-\\u3035]|[\\u309D-\\u309E]|[\\u30FC-\\u30FE])*");
+    
+    /**
+     * Magic value for UTF-16 operations.
+     */
+    private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
+
+    /**
+     * Magic value for UTF-16 operations.
+     */
+    private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
+
+    /**
+     * UTF-16 code unit array containing less than and greater than for emitting
+     * those characters on certain parse errors.
+     */
+    private static final char[] LT_GT = { '<', '>' };
+
+    /**
+     * UTF-16 code unit array containing less than and solidus for emitting
+     * those characters on certain parse errors.
+     */
+    private static final char[] LT_SOLIDUS = { '<', '/' };
+
+    /**
+     * Array version of U+FFFD.
+     */
+    private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
+
+    /**
+     * Array version of space.
+     */
+    private static final char[] SPACE = { ' ' };
+
+    /**
+     * Array version of line feed.
+     */
+    private static final char[] LF = { '\n' };
+
+    /**
+     * Buffer growth parameter.
+     */
+    private static final int BUFFER_GROW_BY = 1024;
+
+    /**
+     * Lexically sorted void element names
+     */
+    private static final String[] VOID_ELEMENTS = { "area", "base", "br",
+            "col", "embed", "hr", "img", "input", "link", "meta", "param" };
+
+    /**
+     * "octype" as <code>char[]</code>
+     */
+    private static final char[] OCTYPE = "octype".toCharArray();
+
+    /**
+     * "ublic" as <code>char[]</code>
+     */
+    private static final char[] UBLIC = "ublic".toCharArray();
+
+    /**
+     * "ystem" as <code>char[]</code>
+     */
+    private static final char[] YSTEM = "ystem".toCharArray();
+
+    /**
+     * The token handler.
+     */
+    private final TokenHandler tokenHandler;
+
+    /**
+     * The error handler.
+     */
+    private ErrorHandler errorHandler;
+
+    /**
+     * The input UTF-16 code unit stream. If a byte stream was given, this
+     * object is an instance of <code>HtmlInputStreamReader</code>.
+     */
+    private Reader reader;
+
+    /**
+     * The main input buffer that the tokenizer reads from. Filled from
+     * <code>reader</code>.
+     */
+    private char[] buf = new char[2048];
+
+    /**
+     * The index of the last <code>char</code> read from <code>buf</code>.
+     */
+    private int pos;
+
+    /**
+     * The index of the first <code>char</code> in <code>buf</code> that is
+     * part of a coalesced run of character tokens or <code>-1</code> if there
+     * is not a current run being coalesced.
+     */
+    private int cstart;
+
+    /**
+     * The number of <code>char</code>s in <code>buf</code> that have
+     * meaning. (The rest of the array is garbage and should not be examined.)
+     */
+    private int bufLen;
+
+    /**
+     * The previous <code>char</code> read from the buffer with infoset
+     * alteration applied except for CR. Used for CRLF normalization and
+     * surrogate pair checking.
+     */
+    private char prev;
+
+    /**
+     * Lookbehind buffer for magic RCDATA/CDATA escaping.
+     */
+    private final char[] prevFour = new char[4];
+
+    /**
+     * Points to the last <code>char</code> written to <code>prevFour</code>.
+     */
+    private int prevFourPtr = 0;
+
+    /**
+     * Single code unit buffer for reconsuming an input character. If
+     * <code>-1</code> the next <code>read()</code> returns from the real
+     * buffer, otherwise from here.
+     */
+    private int unreadBuffer = -1;
+
+    /**
+     * The current line number in the current resource being parsed. (First line
+     * is 1.) Passed on as locator data.
+     */
+    private int line;
+
+    /**
+     * The current column number in the current resource being tokenized. (First
+     * column is 1, counted by UTF-16 code units.) Passed on as locator data.
+     */
+    private int col;
+
+    /**
+     * The SAX public id for the resource being tokenized. (Only passed to back
+     * as part of locator data.)
+     */
+    private String publicId;
+
+    /**
+     * The SAX system id for the resource being tokenized. (Only passed to back
+     * as part of locator data.)
+     */
+    private String systemId;
+
+    /**
+     * Buffer for short identifiers.
+     */
+    private char[] strBuf = new char[64];
+
+    /**
+     * Number of significant <code>char</code>s in <code>strBuf</code>.
+     */
+    private int strBufLen = 0;
+
+    /**
+     * Buffer for long strings.
+     */
+    private char[] longStrBuf = new char[1024];
+
+    /**
+     * Number of significant <code>char</code>s in <code>longStrBuf</code>.
+     */
+    private int longStrBufLen = 0;
+
+    /**
+     * If not U+0000, a pending code unit to be appended to
+     * <code>longStrBuf</code>.
+     */
+    private char longStrBufPending = '\u0000';
+
+    /**
+     * The attribute holder.
+     */
+    private AttributesImpl attributes;
+
+    /**
+     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
+     */
+    private final char[] bmpChar = new char[1];
+
+    /**
+     * Buffer for expanding astral NCRs.
+     */
+    private final char[] astralChar = new char[2];
+
+    /**
+     * Keeps track of PUA warnings.
+     */
+    private boolean alreadyWarnedAboutPrivateUseCharacters;
+
+    /**
+     * http://www.whatwg.org/specs/web-apps/current-work/#content2
+     */
+    private ContentModelFlag contentModelFlag = ContentModelFlag.PCDATA;
+
+    /**
+     * http://www.whatwg.org/specs/web-apps/current-work/#escape
+     */
+    private boolean escapeFlag = false;
+
+    /**
+     * The element whose end tag closes the current CDATA or RCDATA element.
+     */
+    private String contentModelElement = "";
+
+    /**
+     * <code>true</code> if tokenizing an end tag
+     */
+    private boolean endTag;
+
+    /**
+     * The current tag token name.
+     */
+    private String tagName = null;
+
+    /**
+     * The current attribute name.
+     */
+    private String attributeName = null;
+
+    /**
+     * Whether comment tokens are emitted.
+     */
+    private boolean wantsComments = false;
+
+    /**
+     * If <code>false</code>, <code>addAttribute*()</code> are no-ops.
+     */
+    private boolean shouldAddAttributes;
+
+    /**
+     * <code>true</code> when in text content or in attribute value.
+     */
+    private boolean inContent;
+
+    /**
+     * <code>true</code> when HTML4-specific additional errors are requested.
+     */
+    private boolean html4;
+
+    /**
+     * Whether non-ASCII causes an error.
+     */
+    private boolean nonAsciiProhibited;
+
+    /**
+     * Used together with <code>nonAsciiProhibited</code>.
+     */
+    private boolean alreadyComplainedAboutNonAscii;
+
+    /**
+     * Whether the stream is past the first 512 bytes.
+     */
+    private boolean metaBoundaryPassed;
+
+    /**
+     * The name of the current doctype token.
+     */
+    private String doctypeName;
+
+    /**
+     * The public id of the current doctype token.
+     */
+    private String publicIdentifier;
+
+    /**
+     * The system id of the current doctype token.
+     */
+    private String systemIdentifier;
+
+    /**
+     * Used for NFC checking if non-<code>null</code>.
+     */
+    //private NormalizationChecker normalizationChecker = null;
+
+    /**
+     * The policy for vertical tab and form feed.
+     */
+    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALLOW;
+
+    /**
+     * The policy for non-space non-XML characters.
+     */
+    private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.ALLOW;
+
+    /**
+     * The policy for comments.
+     */
+    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALLOW;
+
+    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALLOW;
+
+    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALLOW;
+    
+    private boolean swallowBom;
+
+    private boolean html4ModeCompatibleWithXhtml1Schemata;
+
+    private boolean mappingLangToXmlLang;
+
+    private XmlViolationPolicy bogusXmlnsPolicy;
+
+    // start public API
+
+    /**
+     * The constuctor.
+     * 
+     * @param tokenHandler
+     *            the handler for receiving tokens
+     */
+    public Tokenizer(TokenHandler tokenHandler) {
+        this.tokenHandler = tokenHandler;
+    }
+
+    /**
+     * Turns NFC checking on or off.
+     * 
+     * @param enable
+     *            <code>true</code> if checking on
+     */
+    public void setCheckingNormalization(boolean enable) {
+//        if (enable) {
+//            normalizationChecker = new NormalizationChecker(this);
+//            normalizationChecker.setErrorHandler(errorHandler);
+//        } else {
+//            normalizationChecker = null;
+//        }
+    }
+
+    /**
+     * Query if checking normalization.
+     * 
+     * @return <code>true</code> if checking on
+     */
+    public boolean isCheckingNormalization() {
+//        return normalizationChecker != null;
+      return false;
+    }
+
+    /**
+     * Sets the error handler.
+     * 
+     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
+     */
+    public void setErrorHandler(ErrorHandler eh) {
+        this.errorHandler = eh;
+//        if (this.normalizationChecker != null) {
+//            this.normalizationChecker.setErrorHandler(eh);
+//        }
+    }
+
+    /**
+     * Returns the commentPolicy.
+     * 
+     * @return the commentPolicy
+     */
+    public XmlViolationPolicy getCommentPolicy() {
+        return commentPolicy;
+    }
+
+    /**
+     * Sets the commentPolicy.
+     * 
+     * @param commentPolicy
+     *            the commentPolicy to set
+     */
+    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
+        this.commentPolicy = commentPolicy;
+    }
+
+    /**
+     * Returns the contentNonXmlCharPolicy.
+     * 
+     * @return the contentNonXmlCharPolicy
+     */
+    public XmlViolationPolicy getContentNonXmlCharPolicy() {
+        return contentNonXmlCharPolicy;
+    }
+
+    /**
+     * Sets the contentNonXmlCharPolicy.
+     * 
+     * @param contentNonXmlCharPolicy
+     *            the contentNonXmlCharPolicy to set
+     */
+    public void setContentNonXmlCharPolicy(
+            XmlViolationPolicy contentNonXmlCharPolicy) {
+        this.contentNonXmlCharPolicy = contentNonXmlCharPolicy;
+    }
+
+    /**
+     * Returns the contentSpacePolicy.
+     * 
+     * @return the contentSpacePolicy
+     */
+    public XmlViolationPolicy getContentSpacePolicy() {
+        return contentSpacePolicy;
+    }
+
+    /**
+     * Sets the contentSpacePolicy.
+     * 
+     * @param contentSpacePolicy
+     *            the contentSpacePolicy to set
+     */
+    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
+        this.contentSpacePolicy = contentSpacePolicy;
+    }
+
+    /**
+     * Sets the xmlnsPolicy.
+     * 
+     * @param xmlnsPolicy
+     *            the xmlnsPolicy to set
+     */
+    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
+        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
+            throw new IllegalArgumentException("Can't use FATAL here.");
+        }
+        this.xmlnsPolicy = xmlnsPolicy;
+    }
+
+    public void setNamePolicy(XmlViolationPolicy namePolicy) {
+        this.namePolicy = namePolicy;
+    }
+    
+    /**
+     * Sets the bogusXmlnsPolicy.
+     * 
+     * @param bogusXmlnsPolicy the bogusXmlnsPolicy to set
+     */
+    public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
+        this.bogusXmlnsPolicy = bogusXmlnsPolicy;
+    }
+
+    /**
+     * Sets the html4ModeCompatibleWithXhtml1Schemata.
+     * 
+     * @param html4ModeCompatibleWithXhtml1Schemata
+     *            the html4ModeCompatibleWithXhtml1Schemata to set
+     */
+    public void setHtml4ModeCompatibleWithXhtml1Schemata(
+            boolean html4ModeCompatibleWithXhtml1Schemata) {
+        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
+    }
+
+    /**
+     * Runs the tokenization. This is the main entry point.
+     * 
+     * @param is
+     *            the input source
+     * @throws SAXException
+     *             on fatal error (if configured to treat XML violations as
+     *             fatal) or if the token handler threw
+     * @throws IOException
+     *             if the stream threw
+     */
+    public void tokenize(InputSource is) throws SAXException, IOException {
+        if (is == null) {
+            throw new IllegalArgumentException("InputSource was null.");
+        }
+        swallowBom = true;
+        this.systemId = is.getSystemId();
+        this.publicId = is.getPublicId();
+        this.reader = is.getCharacterStream();
+        CharsetDecoder decoder = decoderFromExternalDeclaration(is.getEncoding());
+        if (this.reader == null) {
+            InputStream inputStream = is.getByteStream();
+            if (inputStream == null) {
+                throw new SAXException("Both streams in InputSource were null.");
+            }
+            if (decoder == null) {
+                this.reader = new HtmlInputStreamReader(inputStream,
+                        errorHandler, this, this);
+            } else {
+                this.reader = new HtmlInputStreamReader(inputStream,
+                        errorHandler, this, this, decoder);
+            }
+        }
+        contentModelFlag = ContentModelFlag.PCDATA;
+        escapeFlag = false;
+        inContent = true;
+        pos = -1;
+        cstart = -1;
+        line = 1;
+        col = 0;
+        prev = '\u0000';
+        bufLen = 0;
+        nonAsciiProhibited = false;
+        alreadyComplainedAboutNonAscii = false;
+        html4 = false;
+        alreadyWarnedAboutPrivateUseCharacters = false;
+        metaBoundaryPassed = false;
+        tokenHandler.start(this);
+        wantsComments = tokenHandler.wantsComments();
+        try {
+            if (swallowBom) {
+                // Swallow the BOM
+                char c = read();
+                if (c == '\uFEFF') {
+                    col = 0;
+                } else {
+                    unread(c);
+                }
+            }
+            dataState();
+        } finally {
+            systemIdentifier = null;
+            publicIdentifier = null;
+            doctypeName = null;
+            tagName = null;
+            attributeName = null;
+            tokenHandler.eof();
+            reader.close();
+        }
+    }
+
+    // For the token handler to call
+    /**
+     * Sets the content model flag and the associated element name.
+     * 
+     * @param contentModelFlag
+     *            the flag
+     * @param contentModelElement
+     *            the element causing the flag to be set
+     */
+    public void setContentModelFlag(ContentModelFlag contentModelFlag,
+            String contentModelElement) {
+        this.contentModelFlag = contentModelFlag;
+        this.contentModelElement = contentModelElement;
+    }
+
+    // start Locator impl
+
+    /**
+     * @see org.xml.sax.Locator#getPublicId()
+     */
+    public String getPublicId() {
+        return publicId;
+    }
+
+    /**
+     * @see org.xml.sax.Locator#getSystemId()
+     */
+    public String getSystemId() {
+        return systemId;
+    }
+
+    /**
+     * @see org.xml.sax.Locator#getLineNumber()
+     */
+    public int getLineNumber() {
+        return line;
+    }
+
+    /**
+     * @see org.xml.sax.Locator#getColumnNumber()
+     */
+    public int getColumnNumber() {
+        return col;
+    }
+
+    // end Locator impl
+
+    // end public API
+
+    void notifyAboutMetaBoundary() {
+        metaBoundaryPassed = true;
+    }
+
+    void turnOnAdditionalHtml4Errors() {
+        html4 = true;
+    }
+
+    void dontSwallowBom() {
+        swallowBom = false;
+    }
+
+    void noEncodingDeclared() {
+        nonAsciiProhibited = true;
+    }
+
+    AttributesImpl newAttributes() {
+        if (mappingLangToXmlLang) {
+            return new XmlLangAttributesImpl();
+        } else {
+            return new AttributesImpl();
+        }
+    }
+
+    /**
+     * Clears the smaller buffer.
+     */
+    private void clearStrBuf() {
+        strBufLen = 0;
+    }
+
+    /**
+     * Appends to the smaller buffer.
+     * 
+     * @param c
+     *            the UTF-16 code unit to append
+     */
+    private void appendStrBuf(char c) {
+        if (strBufLen == strBuf.length) {
+            char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY];
+            System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
+            strBuf = newBuf;
+        }
+        strBuf[strBufLen++] = c;
+    }
+
+    /**
+     * The smaller buffer as a string.
+     * 
+     * @return the smaller buffer as a string
+     */
+    private String strBufToString() {
+        return new String(strBuf, 0, strBufLen);
+    }
+
+    /**
+     * Emits the smaller buffer as character tokens.
+     * 
+     * @throws SAXException
+     *             if the token handler threw
+     */
+    private void emitStrBuf() throws SAXException {
+        if (strBufLen > 0) {
+            tokenHandler.characters(strBuf, 0, strBufLen);
+        }
+    }
+
+    private boolean isNcname(String str) {
+        Matcher m = NCNAME_PATTERN.matcher(str);
+        return m.matches();
+    }
+    
+    /**
+     * Clears the larger buffer.
+     */
+    private void clearLongStrBuf() {
+        longStrBufLen = 0;
+        longStrBufPending = '\u0000';
+    }
+
+    /**
+     * Appends to the larger buffer.
+     * 
+     * @param c
+     *            the UTF-16 code unit to append
+     */
+    private void appendLongStrBuf(char c) {
+        if (longStrBufLen == longStrBuf.length) {
+            char[] newBuf = new char[longStrBuf.length + BUFFER_GROW_BY];
+            System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
+            longStrBuf = newBuf;
+        }
+        longStrBuf[longStrBufLen++] = c;
+    }
+
+    /**
+     * Appends to the larger buffer when it is used to buffer a comment. Checks
+     * for two consecutive hyphens.
+     * 
+     * @param c
+     *            the UTF-16 code unit to append
+     * @throws SAXException
+     */
+    private void appendToComment(char c) throws SAXException {
+        if (longStrBufPending == '-' && c == '-') {
+            if (commentPolicy == XmlViolationPolicy.FATAL) {
+                fatal("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
+            } else {
+                warn("This document is not mappable to XML 1.0 without data loss to \u201C--\u201D in a comment.");
+                if (wantsComments) {
+                    if (commentPolicy == XmlViolationPolicy.ALLOW) {
+                        appendLongStrBuf('-');
+                    } else {
+                        appendLongStrBuf('-');
+                        appendLongStrBuf(' ');
+                    }
+                }
+                longStrBufPending = '-';
+            }
+        } else {
+            if (longStrBufPending != '\u0000') {
+                if (wantsComments) {
+                    appendLongStrBuf(longStrBufPending);
+                }
+                longStrBufPending = '\u0000';
+            }
+            if (c == '-') {
+                longStrBufPending = '-';
+            } else {
+                if (wantsComments) {
+                    appendLongStrBuf(c);
+                }
+            }
+        }
+    }
+
+    /**
+     * Appends to the larger buffer.
+     * 
+     * @param arr
+     *            the UTF-16 code units to append
+     */
+    private void appendLongStrBuf(char[] arr) {
+        for (int i = 0; i < arr.length; i++) {
+            appendLongStrBuf(arr[i]);
+        }
+    }
+
+    /**
+     * Append the contents of the smaller buffer to the larger one.
+     */
+    private void appendStrBufToLongStrBuf() {
+        for (int i = 0; i < strBufLen; i++) {
+            appendLongStrBuf(strBuf[i]);
+        }
+    }
+
+    /**
+     * The larger buffer as a string.
+     * 
+     * @return the larger buffer as a string
+     */
+    private String longStrBufToString() {
+        if (longStrBufPending != '\u0000') {
+            appendLongStrBuf(longStrBufPending);
+        }
+        return new String(longStrBuf, 0, longStrBufLen);
+    }
+
+    /**
+     * Emits the current comment token.
+     * 
+     * @throws SAXException
+     */
+    private void emitComment() throws SAXException {
+        if (wantsComments) {
+            if (longStrBufPending != '\u0000') {
+                appendLongStrBuf(longStrBufPending);
+            }
+        }
+        tokenHandler.comment(longStrBuf, longStrBufLen);
+    }
+
+    /**
+     * Unreads a code unit so that it is returned the next time
+     * <code>read()</code> is called.
+     * 
+     * @param c
+     *            the code unit to unread
+     */
+    private void unread(char c) {
+        unreadBuffer = c;
+    }
+
+    /**
+     * Reads the next UTF-16 code unit.
+     * 
+     * @return the next code unit
+     * @throws SAXException
+     * @throws IOException
+     */
+    private char read() throws SAXException, IOException {
+        for (;;) { // the loop is here for the CRLF case
+            if (unreadBuffer != -1) {
+                char c = (char) unreadBuffer;
+                unreadBuffer = -1;
+                return c;
+            }
+            assert (bufLen > -1);
+            pos++;
+            assert pos <= bufLen;
+            col++;
+            if (pos == bufLen) {
+                boolean charDataContinuation = false;
+                if (cstart > -1) {
+                    flushChars();
+                    charDataContinuation = true;
+                }
+                bufLen = reader.read(buf);
+                assert bufLen <= buf.length;
+                if (bufLen == -1) {
+                    return '\u0000';
+                }
+//                } else if (normalizationChecker != null) {
+//                    normalizationChecker.characters(buf, 0, bufLen);
+//                }
+                if (charDataContinuation) {
+                    cstart = 0;
+                }
+                pos = 0;
+            }
+            char c = buf[pos];
+            if (c > '\u007F' && nonAsciiProhibited
+                    && !alreadyComplainedAboutNonAscii) {
+                err("The character encoding of the document was not explicit but the document contains non-ASCII.");
+            }
+            switch (c) {
+                case '\n':
+                    /*
+                     * U+000D CARRIAGE RETURN (CR) characters, and U+000A LINE
+                     * FEED (LF) characters, are treated specially. Any CR
+                     * characters that are followed by LF characters must be
+                     * removed, and any CR characters not followed by LF
+                     * characters must be converted to LF characters.
+                     */
+                    if (prev == '\r') {
+                        // swallow the LF
+                        col = 0;
+                        if (cstart != -1) {
+                            flushChars();
+                            cstart = pos + 1;
+                        }
+                        prev = c;
+                        continue;
+                    } else {
+                        line++;
+                        col = 0;
+                    }
+                    break;
+                case '\r':
+                    c = buf[pos] = '\n';
+                    line++;
+                    col = 0;
+                    prev = '\r';
+                    if (contentModelFlag != ContentModelFlag.PCDATA) {
+                        prevFourPtr++;
+                        prevFourPtr %= 4;
+                        prevFour[prevFourPtr] = c;
+                    }
+                    return c;
+                case '\u0000':
+                    /*
+                     * All U+0000 NULL characters in the input must be replaced
+                     * by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
+                     * characters is a parse error.
+                     */
+                    err("Found U+0000 in the character stream.");
+                    c = buf[pos] = '\uFFFD';
+                    break;
+                case '\u000B':
+                case '\u000C':
+                    if (inContent) {
+                        if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
+                            fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
+                        } else {
+                            if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+                                c = buf[pos] = ' ';
+                            }
+                            warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
+                        }
+                    }
+                    break;
+                default:
+                    if ((c & 0xFC00) == 0xDC00) {
+                        // Got a low surrogate. See if prev was high surrogate
+                        if ((prev & 0xFC00) == 0xD800) {
+                            int intVal = (prev << 10) + c + SURROGATE_OFFSET;
+                            if (isNonCharacter(intVal)) {
+                                warn("Astral non-character.");
+                            }
+                            if (isAstralPrivateUse(intVal)) {
+                                warnAboutPrivateUseChar();
+                            }
+                        } else {
+                            // XXX figure out what to do about lone high
+                            // surrogates
+                            err("Found low surrogate without high surrogate.");
+                            c = buf[pos] = '\uFFFD';
+                        }
+                    } else if (inContent && (c < ' ' || isNonCharacter(c))
+                            && (c != '\t')) {
+                        if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) {
+                            fatal("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
+                        } else {
+                            if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+                                c = buf[pos] = '\uFFFD';
+                            }
+                            warn("This document is not mappable to XML 1.0 without data loss due to a character that is not a legal XML 1.0 character.");
+                        }
+                    } else if (isPrivateUse(c)) {
+                        warnAboutPrivateUseChar();
+                    }
+            }
+            prev = c;
+            if (contentModelFlag != ContentModelFlag.PCDATA) {
+                prevFourPtr++;
+                prevFourPtr %= 4;
+                prevFour[prevFourPtr] = c;
+            }
+            return c;
+        }
+    }
+
+    /**
+     * Emits a warning about private use characters if the warning has not been
+     * emitted yet.
+     * 
+     * @throws SAXException
+     */
+    private void warnAboutPrivateUseChar() throws SAXException {
+        if (!alreadyWarnedAboutPrivateUseCharacters) {
+            warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
+            alreadyWarnedAboutPrivateUseCharacters = true;
+        }
+    }
+
+    /**
+     * Tells if the argument is a BMP PUA character.
+     * 
+     * @param c
+     *            the UTF-16 code unit to check
+     * @return <code>true</code> if PUA character
+     */
+    private boolean isPrivateUse(char c) {
+        return c >= '\uE000' && c <= '\uF8FF';
+    }
+
+    /**
+     * Tells if the argument is an astral PUA character.
+     * 
+     * @param c
+     *            the code point to check
+     * @return <code>true</code> if astral private use
+     */
+    private boolean isAstralPrivateUse(int c) {
+        return (c >= 0xF0000 && c <= 0xFFFFD)
+                || (c >= 0x100000 && c <= 0x10FFFD);
+    }
+
+    /**
+     * Tells if the argument is a non-character (works for BMP and astral).
+     * 
+     * @param c
+     *            the code point to check
+     * @return <code>true</code> if non-character
+     */
+    private boolean isNonCharacter(int c) {
+        return (c & 0xFFFE) == 0xFFFE;
+    }
+
+    /**
+     * Flushes coalesced character tokens.
+     * 
+     * @throws SAXException
+     */
+    private void flushChars() throws SAXException, IOException {
+        if (cstart != -1) {
+            if (pos > cstart) {
+                tokenHandler.characters(buf, cstart, pos - cstart);
+            }
+        }
+        cstart = -1;
+    }
+
+    /**
+     * Reports an condition that would make the infoset incompatible with XML
+     * 1.0 as fatal.
+     * 
+     * @param message
+     *            the message
+     * @throws SAXException
+     * @throws SAXParseException
+     */
+    private void fatal(String message) throws SAXException {
+        SAXParseException spe = new SAXParseException(message, this);
+        if (errorHandler != null) {
+            errorHandler.fatalError(spe);
+        }
+        throw spe;
+    }
+
+    /**
+     * Reports a Parse Error.
+     * 
+     * @param message
+     *            the message
+     * @throws SAXException
+     */
+    private void err(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        SAXParseException spe = new SAXParseException(message, this);
+        errorHandler.error(spe);
+    }
+
+    /**
+     * Reports a warning
+     * 
+     * @param message
+     *            the message
+     * @throws SAXException
+     */
+    private void warn(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        SAXParseException spe = new SAXParseException(message, this);
+        errorHandler.warning(spe);
+    }
+
+    /**
+     * Initializes a decoder from external decl.
+     */
+    private CharsetDecoder decoderFromExternalDeclaration(String encoding)
+            throws SAXException {
+        if (encoding == null) {
+            return null;
+        }
+        encoding = encoding.toUpperCase();
+        if ("ISO-8859-1".equals(encoding)) {
+            encoding = "Windows-1252";
+        }
+        if ("UTF-16".equals(encoding) || "UTF-32".equals(encoding)) {
+            swallowBom = false;
+        }
+        try {
+            Charset cs = Charset.forName(encoding);
+            String canonName = cs.name();
+            if (canonName.startsWith("X-") || canonName.startsWith("x-")
+                    || canonName.startsWith("Mac")) {
+                if (encoding.startsWith("X-")) {
+                    err("The encoding \u201C"
+                            + encoding
+                            + "\u201D is not an IANA-registered encoding. (Charmod C022)");
+                } else {
+                    err("The encoding \u201C"
+                            + encoding
+                            + "\u201D is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
+                }
+            } else if (!canonName.equalsIgnoreCase(encoding)) {
+                err("The encoding \u201C"
+                        + encoding
+                        + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+                        + canonName + "\u201D. (Charmod C024)");
+            }
+            if (EncodingInfo.isObscure(canonName)) {
+                warn("The character encoding \u201C"
+                        + encoding
+                        + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
+            }
+            return cs.newDecoder();
+        } catch (IllegalCharsetNameException e) {
+            err("Illegal character encoding name: \u201C" + encoding
+                    + "\u201D. Will sniff.");
+        } catch (UnsupportedCharsetException e) {
+            err("Unsupported character encoding name: \u201C" + encoding
+                    + "\u201D. Will sniff.");
+            swallowBom = true;
+        }
+        return null; // keep the compiler happy
+    }
+
+    private boolean currentIsVoid() {
+        return Arrays.binarySearch(VOID_ELEMENTS, tagName) > -1;
+    }
+
+    /**
+     * Data state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     * 
+     */
+    private void dataState() throws SAXException, IOException {
+        char c = '\u0000';
+        for (;;) {
+            c = read();
+            if (c == '&'
+                    && (contentModelFlag == ContentModelFlag.PCDATA || (contentModelFlag == ContentModelFlag.RCDATA)
+                            && !escapeFlag)) {
+                /*
+                 * U+0026 AMPERSAND (&) When the content model flag is set to
+                 * one of the PCDATA or RCDATA states: switch to the entity data
+                 * state. Otherwise: treat it as per the "anything else" entry
+                 * below.
+                 */
+                flushChars();
+                entityDataState();
+                continue;
+            } else if (c == '<'
+                    && ((contentModelFlag == ContentModelFlag.PCDATA) || (escapeFlag == false && (contentModelFlag == ContentModelFlag.CDATA || contentModelFlag == ContentModelFlag.RCDATA)))) {
+                /*
+                 * U+003C LESS-THAN SIGN (<) When the content model flag is set
+                 * to the PCDATA state: switch to the tag open state. When the
+                 * content model flag is set to either the RCDATA state or the
+                 * CDATA state and the escape flag is false: switch to the tag
+                 * open state. Otherwise: treat it as per the "anything else"
+                 * entry below.
+                 */
+                flushChars();
+                resetAttributes();
+                inContent = false;
+                tagOpenState();
+                inContent = true;
+                continue;
+            } else if (c == '\u0000') {
+                /*
+                 * EOF Emit an end-of-file token.
+                 */
+                flushChars();
+                return; // eof() called in parent finally block
+            } else {
+                if (c == '-'
+                        && (escapeFlag == false)
+                        && (contentModelFlag == ContentModelFlag.RCDATA || contentModelFlag == ContentModelFlag.CDATA)
+                        && lastLtExclHyph()) {
+                    /*
+                     * U+002D HYPHEN-MINUS (-) If the content model flag is set
+                     * to either the RCDATA state or the CDATA state, and the
+                     * escape flag is false, and there are at least three
+                     * characters before this one in the input stream, and the
+                     * last four characters in the input stream, including this
+                     * one, are U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK,
+                     * U+002D HYPHEN-MINUS, and U+002D HYPHEN-MINUS ("<!--"),
+                     * then set the escape flag to true.
+                     * 
+                     * In any case, emit the input character as a character
+                     * token. Stay in the data state.
+                     */
+                    escapeFlag = true;
+                } else if (c == '>' && escapeFlag && lastHyphHyph()) {
+                    /*
+                     * U+003E GREATER-THAN SIGN (>) If the content model flag is
+                     * set to either the RCDATA state or the CDATA state, and
+                     * the escape flag is true, and the last three characters in
+                     * the input stream including this one are U+002D
+                     * HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN
+                     * SIGN ("-->"), set the escape flag to false.
+                     * 
+                     * In any case, emit the input character as a character
+                     * token. Stay in the data state.
+                     */
+                    escapeFlag = false;
+                }
+                /*
+                 * Anything else Emit the input character as a character token.
+                 */
+                if (cstart == -1) {
+                    // start coalescing character tokens
+                    cstart = pos;
+                }
+                /*
+                 * Stay in the data state.
+                 */
+                continue;
+            }
+        }
+    }
+
+    private boolean lastHyphHyph() {
+        return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
+                && prevFour[(prevFourPtr - 2 + 4) % 4] == '-';
+    }
+
+    private boolean lastLtExclHyph() {
+        return prevFour[(prevFourPtr - 1 + 4) % 4] == '-'
+                && prevFour[(prevFourPtr - 2 + 4) % 4] == '!'
+                && prevFour[(prevFourPtr - 3 + 4) % 4] == '<';
+    }
+
+    /**
+     * 
+     * Entity data state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private void entityDataState() throws SAXException, IOException {
+        /*
+         * (This cannot happen if the content model flag is set to the CDATA
+         * state.)
+         * 
+         * Attempt to consume an entity.
+         */
+        consumeEntity(false);
+        /*
+         * If nothing is returned, emit a U+0026 AMPERSAND character token.
+         * 
+         * Otherwise, emit the character token that was returned.
+         */
+        // Handled by consumeEntity()
+        /*
+         * Finally, switch to the data state.
+         */
+        return;
+    }
+
+    /**
+     * Tag open state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private void tagOpenState() throws SAXException, IOException {
+        /*
+         * The behaviour of this state depends on the content model flag.
+         */
+        // this can't happen in PLAINTEXT, so using not PCDATA as the condition
+        if (contentModelFlag != ContentModelFlag.PCDATA) {
+            /*
+             * If the content model flag is set to the RCDATA or CDATA states
+             * Consume the next input character.
+             */
+            char c = read();
+            if (c == '/') {
+                /*
+                 * If it is a U+002F SOLIDUS (/) character, switch to the close
+                 * tag open state.
+                 */
+                closeTagOpenState();
+                return;
+            } else {
+                /*
+                 * Otherwise, emit a U+003C LESS-THAN SIGN character token
+                 */
+                tokenHandler.characters(LT_GT, 0, 1);
+                /*
+                 * and reconsume the current input character in the data state.
+                 */
+                unread(c);
+                return;
+            }
+        } else {
+            /*
+             * If the content model flag is set to the PCDATA state Consume the
+             * next input character:
+             */
+            char c = read();
+            if (c == '!') {
+                /*
+                 * U+0021 EXCLAMATION MARK (!) Switch to the markup declaration
+                 * open state.
+                 */
+                markupDeclarationOpenState();
+                return;
+            } else if (c == '/') {
+                /* U+002F SOLIDUS (/) Switch to the close tag open state. */
+                closeTagOpenState();
+                return;
+            } else if (c >= 'A' && c <= 'Z') {
+                /*
+                 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
+                 * LETTER Z Create a new start tag token,
+                 */
+                endTag = false;
+                /*
+                 * set its tag name to the lowercase version of the input
+                 * character (add 0x0020 to the character's code point),
+                 */
+                clearStrBuf();
+                appendStrBuf((char) (c + 0x20));
+                /* then switch to the tag name state. */
+                tagNameState();
+                /*
+                 * (Don't emit the token yet; further details will be filled in
+                 * before it is emitted.)
+                 */
+                return;
+            } else if (c >= 'a' && c <= 'z') {
+                /*
+                 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
+                 * LETTER Z Create a new start tag token,
+                 */
+                endTag = false;
+                /*
+                 * set its tag name to the input character,
+                 */
+                clearStrBuf();
+                appendStrBuf(c);
+                /* then switch to the tag name state. */
+                tagNameState();
+                /*
+                 * (Don't emit the token yet; further details will be filled in
+                 * before it is emitted.)
+                 */
+                return;
+            } else if (c == '>') {
+                /*
+                 * U+003E GREATER-THAN SIGN (>) Parse error.
+                 */
+                err("Bad character \u201C>\u201D in the tag open state.");
+                /*
+                 * Emit a U+003C LESS-THAN SIGN character token and a U+003E
+                 * GREATER-THAN SIGN character token.
+                 */
+                tokenHandler.characters(LT_GT, 0, 2);
+                /* Switch to the data state. */
+                return;
+            } else if (c == '?') {
+                /*
+                 * U+003F QUESTION MARK (?) Parse error.
+                 */
+                err("Bad character \u201C?\u201D in the tag open state.");
+                /*
+                 * Switch to the bogus comment state.
+                 */
+                clearLongStrBuf();
+                appendLongStrBuf(c);
+                bogusCommentState();
+                return;
+            } else {
+                /*
+                 * Anything else Parse error.
+                 */
+                err("Bad character \u201C" + c
+                        + "\u201D in the tag open state.");
+                /*
+                 * Emit a U+003C LESS-THAN SIGN character token
+                 */
+                tokenHandler.characters(LT_GT, 0, 1);
+                /*
+                 * and reconsume the current input character in the data state.
+                 */
+                unread(c);
+                return;
+            }
+        }
+    }
+
+    /**
+     * Close tag open state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private void closeTagOpenState() throws SAXException, IOException {
+        // this can't happen in PLAINTEXT, so using not PCDATA as the condition
+        if (contentModelFlag != ContentModelFlag.PCDATA
+                && contentModelElement != null) {
+            /*
+             * If the content model flag is set to the RCDATA or CDATA states
+             * but no start tag token has ever been emitted by this instance of
+             * the tokeniser (fragment case), or, if the content model flag is
+             * set to the RCDATA or CDATA states and the next few characters do
+             * not match the tag name of the last start tag token emitted (case
+             * insensitively), or if they do but they are not immediately
+             * followed by one of the following characters: + U+0009 CHARACTER
+             * TABULATION + U+000A LINE FEED (LF) + U+000B LINE TABULATION +
+             * U+000C FORM FEED (FF) + U+0020 SPACE + U+003E GREATER-THAN SIGN
+             * (>) + U+002F SOLIDUS (/) + EOF
+             * 
+             * ...then emit a U+003C LESS-THAN SIGN character token, a U+002F
+             * SOLIDUS character token, and switch to the data state to process
+             * the next input character.
+             */
+            // Let's implement the above without lookahead. strBuf holds
+            // characters that need to be emitted if looking for an end tag
+            // fails.
+            // Duplicating the relevant part of tag name state here as well.
+            clearStrBuf();
+            for (int i = 0; i < contentModelElement.length(); i++) {
+                char e = contentModelElement.charAt(i);
+                char c = read();
+                char folded = c;
+                if (c >= 'A' && c <= 'Z') {
+                    folded += 0x20;
+                }
+                if (folded != e) {
+                    if (i > 0 || (folded >= 'a' && folded <= 'z')) {
+                        if (html4) {
+                            err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
+                                    : "RCDATA")
+                                    + " element \u201C"
+                                    + contentModelElement
+                                    + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
+                        } else {
+                            warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
+                                    : "RCDATA")
+                                    + " element \u201C"
+                                    + contentModelElement
+                                    + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
+                        }
+                    }
+                    tokenHandler.characters(LT_SOLIDUS, 0, 2);
+                    emitStrBuf();
+                    unread(c);
+                    return;
+                }
+                appendStrBuf(c);
+            }
+            endTag = true;
+            tagName = contentModelElement;
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
+                     * to the before attribute name state.
+                     */
+                    beforeAttributeNameState();
+                    return;
+                case '>':
+                    /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return;
+                case '\u0000':
+                    /*
+                     * EOF Parse error.
+                     */
+                    err("Expected \u201C>\u201D but saw end of file instead.");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    emitCurrentTagToken();
+                    /* Reconsume the character in the data state. */
+                    unread(c);
+                    return;
+                case '/':
+                    /*
+                     * U+002F SOLIDUS (/) Parse error unless this is a permitted
+                     * slash.
+                     */
+                    // never permitted here
+                    err("Stray \u201C/\u201D in end tag.");
+                    /* Switch to the before attribute name state. */
+                    beforeAttributeNameState();
+                    return;
+                default:
+                    if (html4) {
+                        err((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
+                                : "RCDATA")
+                                + " element \u201C"
+                                + contentModelElement
+                                + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)");
+                    } else {
+                        warn((contentModelFlag == ContentModelFlag.CDATA ? "CDATA"
+                                : "RCDATA")
+                                + " element \u201C"
+                                + contentModelElement
+                                + "\u201D contained the string \u201C</\u201D, but this did not close the element.");
+                    }
+                    tokenHandler.characters(LT_SOLIDUS, 0, 2);
+                    emitStrBuf();
+                    cstart = pos; // don't drop the character
+                    return;
+            }
+        } else {
+            /*
+             * Otherwise, if the content model flag is set to the PCDATA state,
+             * or if the next few characters do match that tag name, consume the
+             * next input character:
+             */
+            char c = read();
+            if (c >= 'A' && c <= 'Z') {
+                /*
+                 * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL
+                 * LETTER Z Create a new end tag token,
+                 */
+                endTag = true;
+                clearStrBuf();
+                /*
+                 * set its tag name to the lowercase version of the input
+                 * character (add 0x0020 to the character's code point),
+                 */
+                appendStrBuf((char) (c + 0x20));
+                /*
+                 * then switch to the tag name state. (Don't emit the token yet;
+                 * further details will be filled in before it is emitted.)
+                 */
+                tagNameState();
+                return;
+            } else if (c >= 'a' && c <= 'z') {
+                /*
+                 * U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL
+                 * LETTER Z Create a new end tag token,
+                 */
+                endTag = true;
+                clearStrBuf();
+                /*
+                 * set its tag name to the input character,
+                 */
+                appendStrBuf(c);
+                /*
+                 * then switch to the tag name state. (Don't emit the token yet;
+                 * further details will be filled in before it is emitted.)
+                 */
+                tagNameState();
+                return;
+            } else if (c == '>') {
+                /* U+003E GREATER-THAN SIGN (>) Parse error. */
+                err("Saw \u201C</>\u201D.");
+                /*
+                 * Switch to the data state.
+                 */
+                return;
+            } else if (c == '\u0000') {
+                /* EOF Parse error. */
+                err("Saw \u201C</\u201D immediately before end of file.");
+                /*
+                 * Emit a U+003C LESS-THAN SIGN character token and a U+002F
+                 * SOLIDUS character token.
+                 */
+                tokenHandler.characters(LT_SOLIDUS, 0, 2);
+                /*
+                 * Reconsume the EOF character in the data state.
+                 */
+                unread(c);
+                return;
+            } else {
+                /* Anything else Parse error. */
+                err("Garbage after \u201C</\u201D.");
+                /*
+                 * Switch to the bogus comment state.
+                 */
+                clearLongStrBuf();
+                appendToComment(c);
+                bogusCommentState();
+                return;
+            }
+        }
+    }
+
+    /**
+     * Tag name state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private void tagNameState() throws SAXException, IOException {
+        for (;;) {
+            /*
+             * Consume the next input character:
+             */
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
+                     * to the before attribute name state.
+                     */
+                    tagName = strBufToElementNameString();
+                    beforeAttributeNameState();
+                    return;
+                case '>':
+                    /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
+                    tagName = strBufToElementNameString();
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return;
+                case '\u0000':
+                    /*
+                     * EOF Parse error.
+                     */
+                    err("End of file seen when looking for tag name");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    tagName = strBufToElementNameString();
+                    emitCurrentTagToken();
+                    /*
+                     * Reconsume the EOF character in the data state.
+                     */
+                    unread(c);
+                    return;
+                case '/':
+                    /*
+                     * U+002F SOLIDUS (/) Parse error unless this is a permitted
+                     * slash.
+                     */
+                    tagName = strBufToElementNameString();
+                    parseErrorUnlessPermittedSlash();
+                    /*
+                     * Switch to the before attribute name state.
+                     */
+                    beforeAttributeNameState();
+                    return;
+                default:
+                    if (c >= 'A' && c <= 'Z') {
+                        /*
+                         * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
+                         * CAPITAL LETTER Z Append the lowercase version of the
+                         * current input character (add 0x0020 to the
+                         * character's code point) to the current tag token's
+                         * tag name.
+                         */
+                        appendStrBuf((char) (c + 0x20));
+                    } else {
+                        /*
+                         * Anything else Append the current input character to
+                         * the current tag token's tag name.
+                         */
+                        appendStrBuf(c);
+                    }
+                    /*
+                     * Stay in the tag name state.
+                     */
+                    continue;
+            }
+        }
+    }
+
+    private String strBufToElementNameString() {
+        // TODO Generate a better interning function
+        return strBufToString().intern();
+    }
+
+    /**
+     * This method implements a wrapper loop for the attribute-related states to
+     * avoid recursion to an arbitrary depth.
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private void beforeAttributeNameState() throws SAXException, IOException {
+        while (beforeAttributeNameStateImpl()) {
+            // Spin.
+        }
+    }
+
+    /**
+     * 
+     */
+    private void resetAttributes() {
+        attributes = null; // XXX figure out reuse
+    }
+
+    /**
+     * Before attribute name state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private boolean beforeAttributeNameStateImpl() throws SAXException,
+            IOException {
+        /*
+         * Consume the next input character:
+         */
+        for (;;) {
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
+                     * in the before attribute name state.
+                     */
+                    continue;
+                case '>':
+                    /*
+                     * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
+                     */
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return false;
+                case '/':
+                    /*
+                     * U+002F SOLIDUS (/) Parse error unless this is a permitted
+                     * slash.
+                     */
+                    parseErrorUnlessPermittedSlash();
+                    /*
+                     * Stay in the before attribute name state.
+                     */
+                    continue;
+                case '\u0000':
+                    /* EOF Parse error. */
+                    err("Saw end of file without the previous tag ending with \u201C>\u201C.");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    emitCurrentTagToken();
+                    /*
+                     * Reconsume the EOF character in the data state.
+                     */
+                    unread(c);
+                    return false;
+                default:
+                    /*
+                     * Anything else Start a new attribute in the current tag
+                     * token.
+                     */
+                    clearStrBuf();
+
+                    if (c >= 'A' && c <= 'Z') {
+                        /*
+                         * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
+                         * CAPITAL LETTER Z Set that attribute's name to the
+                         * lowercase version of the current input character (add
+                         * 0x0020 to the character's code point)
+                         */
+                        appendStrBuf((char) (c + 0x20));
+                    } else {
+                        /*
+                         * Set that attribute's name to the current input
+                         * character,
+                         */
+                        appendStrBuf(c);
+                    }
+                    /*
+                     * and its value to the empty string.
+                     */
+                    // Will do later.
+                    /*
+                     * Switch to the attribute name state.
+                     */
+                    return attributeNameState();
+            }
+        }
+    }
+
+    private void parseErrorUnlessPermittedSlash() throws SAXException,
+            IOException {
+        /*
+         * A permitted slash is a U+002F SOLIDUS character that is immediately
+         * followed by a U+003E GREATER-THAN SIGN, if, and only if, the current
+         * token being processed is a start tag token whose tag name is one of
+         * the following: base, link, meta, hr, br, img, embed, param, area,
+         * col, input
+         */
+        if (endTag) {
+            err("Stray \u201C/\u201D in an end tag.");
+            return;
+        }
+        char c = read();
+        if (c == '>') {
+            if (!currentIsVoid() && !html4) {
+                if (html4) {
+                    err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4.");
+                } else {
+                    err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is only permitted on void elements.");
+                }
+            } else if (html4) {
+                err("Stray \u201C/\u201D in tag. The \u201C/>\u201D syntax is not permitted in HTML4. (HTML4-only error)");
+            }
+        } else {
+            err("Stray \u201C/\u201D in tag.");
+        }
+        unread(c);
+    }
+
+    private void emitCurrentTagToken() throws SAXException {
+        if (namePolicy != XmlViolationPolicy.ALLOW) {
+            if (!isNcname(tagName)) {
+                if (namePolicy == XmlViolationPolicy.FATAL) {
+                    fatal((endTag ? "End": "Start") + " tag \u201C" + tagName + "\u201D has a non-NCName name.");
+                } else {
+                    warn((endTag ? "End": "Start") + " tag \u201C" + tagName + "\u201D has a non-NCName name. Ignoring token.");                    
+                    return;
+                }
+            }
+        }
+        Attributes attrs = (attributes == null ? EmptyAttributes.EMPTY_ATTRIBUTES
+                : attributes);
+        if (endTag) {
+            /*
+             * When an end tag token is emitted, the content model flag must be
+             * switched to the PCDATA state.
+             */
+            escapeFlag = false;
+            contentModelFlag = ContentModelFlag.PCDATA;
+            if (attrs.getLength() != 0) {
+                /*
+                 * When an end tag token is emitted with attributes, that is a
+                 * parse error.
+                 */
+                err("End tag had attributes.");
+            }
+            tokenHandler.endTag(tagName, attrs);
+        } else {
+            tokenHandler.startTag(tagName, attrs);
+        }
+    }
+
+    /**
+     * Attribute name state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private boolean attributeNameState() throws SAXException, IOException {
+        for (;;) {
+            /*
+             * Consume the next input character:
+             */
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Switch
+                     * to the after attribute name state.
+                     */
+                    attributeNameComplete();
+                    return afterAttributeNameState();
+                case '=':
+                    /*
+                     * U+003D EQUALS SIGN (=) Switch to the before attribute
+                     * value state.
+                     */
+                    attributeNameComplete();
+                    return beforeAttributeValueState();
+                case '>':
+                    /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
+                    attributeNameComplete();
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return false;
+                case '/':
+                    /*
+                     * U+002F SOLIDUS (/) Parse error unless this is a permitted
+                     * slash.
+                     */
+                    parseErrorUnlessPermittedSlash();
+                    /* Switch to the before attribute name state. */
+                    attributeNameComplete();
+                    addAttributeWithoutValue();
+                    return true;
+                case '\u0000':
+                    /*
+                     * EOF Parse error.
+                     */
+                    err("End of file occurred in an attribute name.");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    attributeNameComplete();
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /* Reconsume the EOF character in the data state. */
+                    unread(c);
+                    return false;
+                default:
+                    if (c >= 'A' && c <= 'Z') {
+                        /*
+                         * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
+                         * CAPITAL LETTER Z Append the lowercase version of the
+                         * current input character (add 0x0020 to the
+                         * character's code point) to the current attribute's
+                         * name.
+                         */
+                        appendStrBuf((char) (c + 0x20));
+                    } else {
+                        /*
+                         * Anything else Append the current input character to
+                         * the current attribute's name.
+                         */
+                        appendStrBuf(c);
+                    }
+            }
+            /*
+             * Stay in the attribute name state.
+             */
+            continue;
+        }
+    }
+
+    private void attributeNameComplete() throws SAXException {
+        attributeName = strBufToString();
+        if (attributes == null) {
+            attributes = newAttributes();
+        }
+        /*
+         * When the user agent leaves the attribute name state (and before
+         * emitting the tag token, if appropriate), the complete attribute's
+         * name must be compared to the other attributes on the same token; if
+         * there is already an attribute on the token with the exact same name,
+         * then this is a parse error and the new attribute must be dropped,
+         * along with the value that gets associated with it (if any).
+         */
+        if (attributes.getIndex(attributeName) == -1) {
+            if (namePolicy == XmlViolationPolicy.ALLOW) {
+                shouldAddAttributes = true;                
+            } else {
+                if (isNcname(attributeName)) {
+                    shouldAddAttributes = true;                                    
+                } else {
+                    if (namePolicy == XmlViolationPolicy.FATAL) {
+                        fatal("Attribute name \u201C" + attributeName + "\u201D is not an NCName.");
+                    } else {
+                        shouldAddAttributes = false;
+                        warn("Attribute name \u201C" + attributeName + "\u201D is not an NCName. Ignoring the attribute.");
+                    }
+                }
+            }
+        } else {
+            shouldAddAttributes = false;
+            err("Duplicate attribute \u201C" + attributeName + "\u201D.");
+        }
+    }
+
+    private void addAttributeWithoutValue() throws SAXException {
+        if (metaBoundaryPassed && "charset".equals(attributeName)
+                && "meta".equals(tagName)) {
+            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
+        }
+        if (shouldAddAttributes) {
+            if (html4) {
+                if (AttributeInfo.isBoolean(attributeName)) {
+                    if (html4ModeCompatibleWithXhtml1Schemata) {
+                        attributes.addAttribute(attributeName, attributeName);
+                    } else {
+                        attributes.addAttribute(attributeName, "");
+                    }
+                } else {
+                    err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
+                    attributes.addAttribute(attributeName, "");
+                }
+            } else {
+                if ("src".equals(attributeName) || "href".equals(attributeName)) {
+                    warn("Attribute \u201C" + attributeName + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
+                }
+                attributes.addAttribute(attributeName, "");
+            }
+        }
+    }
+
+    private void addAttributeWithValue() throws SAXException {
+        if (metaBoundaryPassed && "meta" == tagName
+                && "charset".equals(attributeName)) {
+            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
+        }
+        if (shouldAddAttributes) {
+            String value = longStrBufToString();
+            if (!endTag) {
+                if ("xmlns".equals(attributeName)) {
+                    if ("html" == tagName
+                            && "http://www.w3.org/1999/xhtml".equals(value)) {
+                        if (xmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+                            return;
+                        }
+                    } else {
+                        if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
+                            fatal("Forbidden attribute \u201C" + attributeName + "\u201D is not mappable to namespace-aware XML 1.0.");
+                        } else {
+                            warn("Forbidden attribute \u201C" + attributeName + "\u201D is not mappable to namespace-aware XML 1.0.");
+                            if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+                                return;
+                            }
+                        }
+                    }
+                } else if (attributeName.startsWith("xmlns:")) {
+                    if (bogusXmlnsPolicy == XmlViolationPolicy.FATAL) {
+                        fatal("Forbidden attribute \u201C" + attributeName + "\u201D is not mappable to namespace-aware XML 1.0.");
+                    } else {
+                        warn("Forbidden attribute \u201C" + attributeName + "\u201D is not mappable to namespace-aware XML 1.0.");
+                        if (bogusXmlnsPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+                            return;
+                        }
+                    }                    
+                }
+            }
+            attributes.addAttribute(attributeName, value);
+        }
+    }
+
+    /**
+     * After attribute name state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private boolean afterAttributeNameState() throws SAXException, IOException {
+        for (;;) {
+            /*
+             * Consume the next input character:
+             */
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
+                     * in the after attribute name state.
+                     */
+                    continue;
+                case '=':
+                    /*
+                     * U+003D EQUALS SIGN (=) Switch to the before attribute
+                     * value state.
+                     */
+                    return beforeAttributeValueState();
+                case '>':
+                    /*
+                     * U+003E GREATER-THAN SIGN (>) Emit the current tag token.
+                     */
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return false;
+                case '/':
+                    /*
+                     * U+002F SOLIDUS (/) Parse error unless this is a permitted
+                     * slash.
+                     */
+                    addAttributeWithoutValue();
+                    parseErrorUnlessPermittedSlash();
+                    /* Switch to the before attribute name state. */
+                    return true;
+                case '\u0000':
+                    /* EOF Parse error. */
+                    err("Saw end of file without the previous tag ending with \u201C>\u201C.");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /*
+                     * Reconsume the character in the data state.
+                     */
+                    unread(c);
+                    return false;
+                default:
+                    /*
+                     * U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN
+                     * CAPITAL LETTER Z Start a new attribute in the current tag
+                     * token. Set that attribute's name to the lowercase version
+                     * of the current input character (add 0x0020 to the
+                     * character's code point), and its value to the empty
+                     * string. Switch to the attribute name state.
+                     * 
+                     * Anything else Start a new attribute in the current tag
+                     * token. Set that attribute's name to the current input
+                     * character, and its value to the empty string. Switch to
+                     * the attribute name state.
+                     */
+                    // let's do this by respinning through the attribute loop
+                    addAttributeWithoutValue();
+                    unread(c);
+                    return true;
+            }
+        }
+    }
+
+    /**
+     * Before attribute value state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private boolean beforeAttributeValueState() throws SAXException,
+            IOException {
+        clearLongStrBuf();
+        for (;;) {
+            /*
+             * Consume the next input character:
+             */
+            char c = read();
+            switch (c) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\u000C':
+                    /*
+                     * U+0009 CHARACTER TABULATION U+000A LINE FEED (LF) U+000B
+                     * LINE TABULATION U+000C FORM FEED (FF) U+0020 SPACE Stay
+                     * in the before attribute value state.
+                     */
+                    continue;
+                case '"':
+                    /*
+                     * U+0022 QUOTATION MARK (") Switch to the attribute value
+                     * (double-quoted) state.
+                     */
+                    return attributeValueDoubleQuotedState();
+                case '&':
+                    /*
+                     * U+0026 AMPERSAND (&) Switch to the attribute value
+                     * (unquoted) state and reconsume this input character.
+                     */
+                    unread(c);
+                    return attributeValueUnquotedState();
+                case '\'':
+                    /*
+                     * U+0027 APOSTROPHE (') Switch to the attribute value
+                     * (single-quoted) state.
+                     */
+                    return attributeValueSingleQuotedState();
+                case '>':
+                    /* U+003E GREATER-THAN SIGN (>) Emit the current tag token. */
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /*
+                     * Switch to the data state.
+                     */
+                    return false;
+                case '\u0000':
+                    /* EOF Parse error. */
+                    err("Saw end of file without the previous tag ending with \u201C>\u201C.");
+                    /*
+                     * Emit the current tag token.
+                     */
+                    addAttributeWithoutValue();
+                    emitCurrentTagToken();
+                    /*
+                     * Reconsume the character in the data state.
+                     */
+                    unread(c);
+                    return false;
+                default:
+                    if (html4
+                            && !((c >= 'a' && c <= 'z')
+                                    || (c >= 'A' && c <= 'Z')
+                                    || (c >= '0' && c <= '9') || c == '.'
+                                    || c == '-' || c == '_' || c == ':')) {
+                        err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)");
+                    }
+                    /*
+                     * Anything else Append the current input character to the
+                     * current attribute's value.
+                     */
+                    appendLongStrBuf(c);
+                    /*
+                     * Switch to the attribute value (unquoted) state.
+                     */
+                    return attributeValueUnquotedState();
+            }
+        }
+    }
+
+    /**
+     * Attribute value (double-quoted) state
+     * 
+     * @throws IOException
+     * @throws SAXException
+     */
+    private boolean attributeValueDoubleQuotedState() throws SAXException,

[... 1839 lines stripped ...]


Mime
View raw message