abdera-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jmsn...@apache.org
Subject svn commit: r587550 [1/6] - in /incubator/abdera/java/trunk/extensions/json/src/main: java/nu/ java/nu/validator/ java/nu/validator/htmlparser/ java/nu/validator/htmlparser/common/ java/nu/validator/htmlparser/impl/ java/nu/validator/htmlparser/sax/ ja...
Date Tue, 23 Oct 2007 16:28:58 GMT
Author: jmsnell
Date: Tue Oct 23 09:28:51 2007
New Revision: 587550

URL: http://svn.apache.org/viewvc?rev=587550&view=rev
Log:
updated json serialization support including a subset of the nu.validator html parser for
producing hashed html output

Added:
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DoctypeExpectation.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentMode.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentModeHandler.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/XmlViolationPolicy.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/package.html
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributeInfo.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributesImpl.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/BomSniffer.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ByteReadable.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ContentModelFlag.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EmptyAttributes.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EncodingInfo.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Entities.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/HtmlInputStreamReader.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Tokenizer.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TreeBuilder.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/XmlLangAttributesImpl.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/package.html
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/HtmlParser.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/HtmlSerializer.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/SAXStreamer.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/SAXTreeBuilder.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/sax/package.html
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/CDATA.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/CharBufferNode.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Characters.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Comment.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/DTD.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Document.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/DocumentFragment.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Element.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Entity.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/IgnorableWhitespace.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/LocatorImpl.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/Node.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/NodeType.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/NullLexicalHandler.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/ParentNode.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/PrefixMapping.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/ProcessingInstruction.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/SkippedEntity.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/TreeBuilder.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/TreeParser.java
    incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/saxtree/package.html
    incubator/abdera/java/trunk/extensions/json/src/main/java/org/apache/abdera/ext/html/
    incubator/abdera/java/trunk/extensions/json/src/main/java/org/apache/abdera/ext/html/HtmlHelper.java
    incubator/abdera/java/trunk/extensions/json/src/main/resources/META-INF/LICENSE.htmlparser.txt
    incubator/abdera/java/trunk/extensions/json/src/main/resources/META-INF/LICENSE.serializer.txt
    incubator/abdera/java/trunk/extensions/json/src/main/resources/META-INF/NOTICE.htmlparser.txt
    incubator/abdera/java/trunk/extensions/json/src/main/resources/META-INF/NOTICE.serializer.txt
Modified:
    incubator/abdera/java/trunk/extensions/json/src/main/java/org/apache/abdera/ext/json/JSONUtil.java

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DoctypeExpectation.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DoctypeExpectation.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DoctypeExpectation.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DoctypeExpectation.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.common;
+
+/**
+ * Used for indicating desired behavior with legacy doctypes.
+ * 
+ * @version $Id: DoctypeExpectation.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public enum DoctypeExpectation {
+    /**
+     * Be a pure HTML5 parser.
+     */
+    HTML,
+    
+    /**
+     * Require the HTML 4.01 Transitional public id. Turn on HTML4-specific
+     * additional errors regardless of doctype.
+     */
+    HTML401_TRANSITIONAL,
+    
+    /**
+     * Require the HTML 4.01 Transitional public id and a system id. Turn on
+     * HTML4-specific additional errors regardless of doctype.
+     */
+    HTML401_STRICT,
+    
+    /**
+     * Treat the doctype required by HTML 5, doctypes with the HTML 4.01 Strict 
+     * public id and doctypes with the HTML 4.01 Transitional public id and a 
+     * system id as non-errors. Turn on HTML4-specific additional errors if the 
+     * public id is the HTML 4.01 Strict or Transitional public id.
+     */
+    AUTO,
+    
+    /**
+     * Never enable HTML4-specific error checks. Never report any doctype 
+     * condition as an error. (Doctype tokens in wrong places will be 
+     * reported as errors, though.) The application may decide what to log 
+     * in response to calls to <code>DocumentModeHandler</code>. This mode 
+     * is meant for doing surveys on existing content.
+     */
+    NO_DOCTYPE_ERRORS
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentMode.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentMode.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentMode.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentMode.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.common;
+
+public enum DocumentMode {
+STANDARDS_MODE,
+ALMOST_STANDARDS_MODE,
+QUIRKS_MODE
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentModeHandler.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentModeHandler.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentModeHandler.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/DocumentModeHandler.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.common;
+
+
+import org.xml.sax.SAXException;
+
+public interface DocumentModeHandler {
+    public void documentMode(DocumentMode mode, String publicIdentifier, String systemIdentifier, boolean html4SpecificAdditionalErrorChecks) throws SAXException;
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/XmlViolationPolicy.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/XmlViolationPolicy.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/XmlViolationPolicy.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/XmlViolationPolicy.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.common;
+
+/**
+ * Policy for XML 1.0 violations.
+ * 
+ * @version $Id: XmlViolationPolicy.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public enum XmlViolationPolicy {
+    /**
+     * Conform to HTML 5, allow XML 1.0 to be violated.
+     */
+    ALLOW,
+    
+    /**
+     * Halt when something cannot be mapped to XML 1.0.
+     */
+    FATAL,
+    
+    /**
+     * Be non-conforming and alter the infoset to fit 
+     * XML 1.0 when something would otherwise not be 
+     * mappable to XML 1.0.
+     */
+    ALTER_INFOSET
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/package.html
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/package.html?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/package.html (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/common/package.html Tue Oct 23 09:28:51 2007
@@ -0,0 +1,29 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head><title>Package Overview</title>
+<!--
+ Copyright (c) 2007 Henri Sivonen
+
+ Permission is hereby granted, free of charge, to any person obtaining a 
+ copy of this software and associated documentation files (the "Software"), 
+ to deal in the Software without restriction, including without limitation 
+ the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ and/or sell copies of the Software, and to permit persons to whom the 
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in 
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ DEALINGS IN THE SOFTWARE.
+-->
+</head>
+<body bgcolor="white">
+<p>This package provides common interfaces and enumerations.</p>
+</body>
+</html>
\ No newline at end of file

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributeInfo.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributeInfo.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributeInfo.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributeInfo.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2005, 2006 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.util.Arrays;
+
+/**
+ * This class will probably be used for backwards compat stuff.
+ * 
+ * @version $Id: AttributeInfo.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public class AttributeInfo {
+    private static final String[] BOOLEAN_ATTRIBUTES = { "active", "async",
+            "autofocus", "autosubmit", "checked", "compact", "declare",
+            "default", "defer", "disabled", "ismap", "multiple", "nohref",
+            "noresize", "noshade", "nowrap", "readonly", "required", "selected" };
+
+    private static final String[] CASE_FOLDED = { "active", "align", "async",
+            "autocomplete", "autofocus", "autosubmit", "checked", "clear",
+            "compact", "dataformatas", /* sic */
+            "declare", "default", "defer", "dir", "disabled", "enctype",
+            "frame", "ismap", "method", "multiple", "nohref", "noresize",
+            "noshade", "nowrap", "readonly", "replace", "required", "rules",
+            "scope", "scrolling", "selected", "shape", "step", "type",
+            "valign", "valuetype" };
+
+    public static boolean isBoolean(String name) {
+        return Arrays.binarySearch(BOOLEAN_ATTRIBUTES, name) > -1;
+    }
+
+    public static boolean isCaseFolded(String name) {
+        return Arrays.binarySearch(CASE_FOLDED, name) > -1;
+    }
+
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributesImpl.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributesImpl.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributesImpl.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/AttributesImpl.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import org.xml.sax.Attributes;
+
+/**
+ * Be careful with this class. QName is the name from 
+ * HTML tokenization. Otherwise, please refer to the interface doc.
+ * 
+ * @version $Id: AttributesImpl.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public class AttributesImpl implements Attributes {
+
+    private int length = 0;
+
+    private int limit = 0;
+
+    private String[] array = new String[10]; // covers 98.3% of elements according to Hixie
+
+    public final int getIndex(String qName) {
+        for (int i = 0; i < limit; i += 2) {
+            if (array[i].equals(qName)) {
+                return i / 2;
+            }
+        }
+        return -1;
+    }
+
+    public int getIndex(String uri, String localName) {
+        if ("".equals(uri)) {
+            return getIndex(localName);
+        } else {
+            return -1;
+        }
+    }
+
+    public final int getLength() {
+        return length;
+    }
+
+    public final String getLocalName(int index) {
+        return getQName(index);
+    }
+
+    public final String getQName(int index) {
+        return index < length ? array[index * 2] : null;
+    }
+
+    public final String getType(int index) {
+        if (index < length) {
+            if ("id".equals(getQName(index))) {
+                return "ID";
+            } else {
+                return "CDATA";
+            }
+        } else {
+            return null;
+        }
+    }
+
+    public final String getType(String qName) {
+        int index = getIndex(qName);
+        if (index == -1) {
+            return null;
+        } else {
+            return getType(index);
+        }
+    }
+
+    public String getType(String uri, String localName) {
+        if ("".equals(uri)) {
+            return getType(localName);
+        } else {
+            return null;
+        }
+    }
+
+    public String getURI(int index) {
+        return index < length ? "" : null;
+    }
+
+    public final String getValue(int index) {
+        return index < length ? array[index * 2 + 1] : null;
+    }
+
+    public final String getValue(String qName) {
+        int index = getIndex(qName);
+        if (index == -1) {
+            return null;
+        } else {
+            return getValue(index);
+        }
+    }
+
+    public String getValue(String uri, String localName) {
+        if ("".equals(uri)) {
+            return getValue(localName);
+        } else {
+            return null;
+        }
+    }
+
+    public final void addAttribute(String name, String value) {
+        if (array.length == limit) {
+            String[] newArray = new String[array.length + 10]; // The first growth covers virtually 100% of elements according to Hixie
+            System.arraycopy(array, 0, newArray, 0, array.length);
+            array = newArray;
+        }
+        array[limit] = name;
+        array[limit + 1] = value;
+        length++;
+        limit += 2;
+    }
+
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/BomSniffer.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/BomSniffer.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/BomSniffer.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/BomSniffer.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+
+/**
+ * The BOM sniffing part of the HTML5 encoding sniffing algorithm.
+ * 
+ * @version $Id: BomSniffer.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public final class BomSniffer {
+    
+    private final ByteReadable source;
+
+    /**
+     * @param source
+     */
+    public BomSniffer(final ByteReadable source) {
+        this.source = source;
+    }
+    
+    CharsetDecoder sniff() throws IOException {
+        int b = source.readByte();
+        if (b == 0xEF) { // UTF-8
+            b = source.readByte();
+            if (b == 0xBB) {
+                b = source.readByte();
+                if (b == 0xBF) {
+//                    return new CharsetProviderICU().charsetForName("UTF-8").newDecoder();
+                    return Charset.forName("UTF-8").newDecoder();
+                } else {
+                    return null;
+                }
+            } else {
+                return null;
+            }
+        } else if (b == 0xFF) { // little-endian
+            b = source.readByte();
+            if (b == 0xFE) {
+                return Charset.forName("UTF-16LE").newDecoder();
+            } else {
+                return null;
+            }
+        } else if (b == 0xFE) { // big-endian UTF-16
+            b = source.readByte();
+            if (b == 0xFF) {
+                return Charset.forName("UTF-16BE").newDecoder();        
+            } else {
+                return null;
+            }
+        } else {
+            return null;            
+        }
+    }
+    
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ByteReadable.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ByteReadable.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ByteReadable.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ByteReadable.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+
+public interface ByteReadable {
+  public int readByte() throws IOException;
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ContentModelFlag.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ContentModelFlag.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ContentModelFlag.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/ContentModelFlag.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+/**
+ * The four content model flag values known to this package.
+ *
+ * NOTE(review): the names look like the tokenizer "content model flag" of
+ * the HTML5 drafts; the code that switches on them is not in this file --
+ * confirm the exact semantics there. The declaration order is kept as is
+ * in case any consumer depends on <code>ordinal()</code>.
+ */
+public enum ContentModelFlag {
+    PCDATA,
+    RCDATA,
+    CDATA,
+    PLAINTEXT;
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EmptyAttributes.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EmptyAttributes.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EmptyAttributes.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EmptyAttributes.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2005 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import org.xml.sax.Attributes;
+
+/**
+ * An attribute list that never contains anything.
+ *
+ * The length is always zero, every index lookup answers <code>-1</code>
+ * and every accessor that would return a string answers <code>null</code>,
+ * whatever arguments are passed. The class holds no state; the only
+ * instance is {@link #EMPTY_ATTRIBUTES} because the constructor is private.
+ *
+ * @version $Id: EmptyAttributes.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public class EmptyAttributes implements Attributes {
+
+    /** The single shared instance of the empty attribute list. */
+    public static final EmptyAttributes EMPTY_ATTRIBUTES = new EmptyAttributes();
+
+    /** Not instantiable from outside; use {@link #EMPTY_ATTRIBUTES}. */
+    private EmptyAttributes() {
+    }
+
+    // ---- size ----------------------------------------------------------
+
+    /**
+     * @return always <code>0</code>
+     * @see org.xml.sax.Attributes#getLength()
+     */
+    public int getLength() {
+        return 0;
+    }
+
+    // ---- name-to-index lookups: nothing is ever found --------------------
+
+    /**
+     * @return always <code>-1</code>
+     * @see org.xml.sax.Attributes#getIndex(java.lang.String)
+     */
+    public int getIndex(String qName) {
+        return -1;
+    }
+
+    /**
+     * @return always <code>-1</code>
+     * @see org.xml.sax.Attributes#getIndex(java.lang.String, java.lang.String)
+     */
+    public int getIndex(String uri, String localName) {
+        return -1;
+    }
+
+    // ---- per-index name parts: every index is out of range ---------------
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getURI(int)
+     */
+    public String getURI(int index) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getLocalName(int)
+     */
+    public String getLocalName(int index) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getQName(int)
+     */
+    public String getQName(int index) {
+        return null;
+    }
+
+    // ---- attribute types -------------------------------------------------
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getType(int)
+     */
+    public String getType(int index) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getType(java.lang.String)
+     */
+    public String getType(String qName) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getType(java.lang.String, java.lang.String)
+     */
+    public String getType(String uri, String localName) {
+        return null;
+    }
+
+    // ---- attribute values ------------------------------------------------
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getValue(int)
+     */
+    public String getValue(int index) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getValue(java.lang.String)
+     */
+    public String getValue(String qName) {
+        return null;
+    }
+
+    /**
+     * @return always <code>null</code>
+     * @see org.xml.sax.Attributes#getValue(java.lang.String, java.lang.String)
+     */
+    public String getValue(String uri, String localName) {
+        return null;
+    }
+
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EncodingInfo.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EncodingInfo.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EncodingInfo.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/EncodingInfo.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2006 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+/**
+ * Classifies the character encodings installed in the running JVM.
+ *
+ * When the class is initialised, every charset reported by
+ * {@link Charset#availableCharsets()} is probed with a fixed buffer holding
+ * the bytes 0x20 through 0x7F followed by LF, CR and TAB. A charset whose
+ * decoder turns each of those bytes into the code point of the same value
+ * is recorded as an "ASCII superset"; every other charset (including one
+ * whose decoder cannot be created or reports an error) is recorded as "not
+ * an ASCII superset". Control characters other than LF, CR and TAB are not
+ * probed.
+ *
+ * All lookups are exact, case-sensitive matches against
+ * {@link Charset#name()}. A <code>null</code> name is never found in any
+ * list.
+ */
+public class EncodingInfo {
+
+    /**
+     * Encoding names that {@link #isObscure(String)} does not report as
+     * obscure. The array is searched with
+     * {@link Arrays#binarySearch(Object[], Object)} and therefore must stay
+     * sorted in natural <code>String</code> order (upper case before lower
+     * case).
+     */
+    private static final String[] NOT_OBSCURE = {"Big5",
+        "Big5-HKSCS",
+        "EUC-JP",
+        "EUC-KR",
+        "GB18030",
+        "GBK",
+        "ISO-2022-JP",
+        "ISO-2022-KR",
+        "ISO-8859-1",
+        "ISO-8859-13",
+        "ISO-8859-15",
+        "ISO-8859-2",
+        "ISO-8859-3",
+        "ISO-8859-4",
+        "ISO-8859-5",
+        "ISO-8859-6",
+        "ISO-8859-7",
+        "ISO-8859-8",
+        "ISO-8859-9",
+        "KOI8-R",
+        "Shift_JIS",
+        "TIS-620",
+        "US-ASCII",
+        "UTF-16",
+        "UTF-16BE",
+        "UTF-16LE",
+        "UTF-8",
+        "windows-1250",
+        "windows-1251",
+        "windows-1252",
+        "windows-1253",
+        "windows-1254",
+        "windows-1255",
+        "windows-1256",
+        "windows-1257",
+        "windows-1258"};
+
+    /** Sorted names of the charsets that passed the probe. */
+    private static final String[] asciiSuperset;
+
+    /** Sorted names of the charsets that failed the probe. */
+    private static final String[] notAsciiSuperset;
+
+    static {
+        // Probe buffer: 0x20..0x7F, then LF, CR, TAB (0x63 bytes in total).
+        byte[] testBuf = new byte[0x63];
+        for (int i = 0; i < 0x60; i++) {
+            testBuf[i] = (byte) (i + 0x20);
+        }
+        testBuf[0x60] = (byte) '\n';
+        testBuf[0x61] = (byte) '\r';
+        testBuf[0x62] = (byte) '\t';
+
+        // TreeSet keeps the names in natural String order, which is the
+        // order Arrays.binarySearch relies on in the lookups below.
+        SortedSet<String> asciiSupersetSet = new TreeSet<String>();
+        SortedSet<String> notAsciiSupersetSet = new TreeSet<String>();
+
+        for (Charset cs : Charset.availableCharsets().values()) {
+            if (asciiMapsToBasicLatin(testBuf, cs)) {
+                asciiSupersetSet.add(cs.name());
+            } else {
+                notAsciiSupersetSet.add(cs.name());
+            }
+        }
+
+        asciiSuperset = asciiSupersetSet.toArray(
+                new String[asciiSupersetSet.size()]);
+        notAsciiSuperset = notAsciiSupersetSet.toArray(
+                new String[notAsciiSupersetSet.size()]);
+    }
+
+    /**
+     * Tells whether the named charset passed the ASCII probe.
+     *
+     * @param preferredIanaName the charset name, compared exactly against
+     *            {@link Charset#name()}; may be <code>null</code>
+     * @return <code>true</code> if the charset is installed and decoded the
+     *         probe bytes to the same code points; <code>false</code>
+     *         otherwise, including for <code>null</code>
+     */
+    public static boolean isAsciiSuperset(String preferredIanaName) {
+        return contains(asciiSuperset, preferredIanaName);
+    }
+
+    /**
+     * Tells whether the named charset failed the ASCII probe.
+     *
+     * @param preferredIanaName the charset name, compared exactly against
+     *            {@link Charset#name()}; may be <code>null</code>
+     * @return <code>true</code> if the charset is installed and did not
+     *         decode the probe bytes to the same code points;
+     *         <code>false</code> otherwise, including for <code>null</code>
+     *         and for charsets that are not installed
+     */
+    public static boolean isNotAsciiSuperset(String preferredIanaName) {
+        return contains(notAsciiSuperset, preferredIanaName);
+    }
+
+    /**
+     * Tells whether the name is missing from the built-in list of
+     * non-obscure encodings.
+     *
+     * @param preferredIanaName the encoding name, compared exactly and
+     *            case-sensitively; may be <code>null</code>
+     * @return <code>false</code> if the name is on the list;
+     *         <code>true</code> otherwise, including for <code>null</code>
+     */
+    public static boolean isObscure(String preferredIanaName) {
+        return !contains(NOT_OBSCURE, preferredIanaName);
+    }
+
+    /**
+     * Null-safe membership test on a sorted array of names.
+     *
+     * @param sortedNames names in natural <code>String</code> order
+     * @param name the name to look for; <code>null</code> is never found
+     *            (binarySearch itself would throw on a null key)
+     * @return <code>true</code> if <code>name</code> occurs in the array
+     */
+    private static boolean contains(String[] sortedNames, String name) {
+        return name != null && Arrays.binarySearch(sortedNames, name) >= 0;
+    }
+
+    /**
+     * Decodes <code>testBuf</code> with a strict decoder of <code>cs</code>
+     * and checks that every byte comes out as the code point of the same
+     * value.
+     *
+     * @param testBuf the probe bytes; each is expected to decode to exactly
+     *            one <code>char</code> equal to the unsigned byte value
+     * @param cs the charset to probe
+     * @return <code>true</code> if all bytes decoded to themselves;
+     *         <code>false</code> on the first mismatch or on any failure to
+     *         create or run the decoder
+     */
+    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
+        Reader r = null;
+        try {
+            // Creating the decoder happens inside the try so that a
+            // misbehaving charset provider only fails this one probe
+            // instead of aborting the static initialiser of this class.
+            CharsetDecoder dec = cs.newDecoder();
+            dec.onMalformedInput(CodingErrorAction.REPORT);
+            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
+            r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
+            for (int i = 0; i < testBuf.length; i++) {
+                if ((testBuf[i] & 0xFF) != r.read()) {
+                    return false;
+                }
+            }
+            return true;
+        } catch (Exception e) {
+            // Deliberately broad: the probe runs over every charset the JVM
+            // happens to have installed, and any decoding problem simply
+            // means "not an ASCII superset".
+            return false;
+        } finally {
+            if (r != null) {
+                try {
+                    r.close();
+                } catch (IOException ignored) {
+                    // Nothing useful can be done; the source is in memory.
+                }
+            }
+        }
+    }
+
+    /**
+     * Prints both classification lists to standard output.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        System.out.println("ASCII maps to Basic Latin:");
+        for (String name : asciiSuperset) {
+            System.out.println(name);
+        }
+        System.out.println();
+        System.out.println("ASCII does not map to Basic Latin:");
+        for (String name : notAsciiSuperset) {
+            System.out.println(name);
+        }
+    }
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Entities.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Entities.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Entities.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/Entities.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,796 @@
+/*
+ * Copyright (c) 2005 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+/**
+ * Static lookup tables for named character references and for the
+ * Windows-1252 reading of the 0x80-0x9F range. This class only holds data;
+ * the code that consults the tables is not part of this file.
+ *
+ * {@link #NAMES} and {@link #VALUES} have the same number of entries and
+ * correspond by index, so any edit must be applied to both arrays at the
+ * same position.
+ *
+ * @version $Id: Entities.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public final class Entities {
+    // Entity names as written after the '&'. Some names are listed twice,
+    // once without and once with the trailing ';' (e.g. "AElig" and
+    // "AElig;"); others appear only in the ';'-terminated form.
+    // The entries appear to be in ascending String order (upper case sorts
+    // before lower case) -- presumably so that a consumer can binary-search
+    // them; confirm against the code that reads this table before
+    // inserting anything out of order.
+    static final String[] NAMES = { "AElig",
+        "AElig;",
+        "AMP",
+        "AMP;",
+        "Aacute",
+        "Aacute;",
+        "Acirc",
+        "Acirc;",
+        "Agrave",
+        "Agrave;",
+        "Alpha;",
+        "Aring",
+        "Aring;",
+        "Atilde",
+        "Atilde;",
+        "Auml",
+        "Auml;",
+        "Beta;",
+        "COPY",
+        "COPY;",
+        "Ccedil",
+        "Ccedil;",
+        "Chi;",
+        "Dagger;",
+        "Delta;",
+        "ETH",
+        "ETH;",
+        "Eacute",
+        "Eacute;",
+        "Ecirc",
+        "Ecirc;",
+        "Egrave",
+        "Egrave;",
+        "Epsilon;",
+        "Eta;",
+        "Euml",
+        "Euml;",
+        "GT",
+        "GT;",
+        "Gamma;",
+        "Iacute",
+        "Iacute;",
+        "Icirc",
+        "Icirc;",
+        "Igrave",
+        "Igrave;",
+        "Iota;",
+        "Iuml",
+        "Iuml;",
+        "Kappa;",
+        "LT",
+        "LT;",
+        "Lambda;",
+        "Mu;",
+        "Ntilde",
+        "Ntilde;",
+        "Nu;",
+        "OElig;",
+        "Oacute",
+        "Oacute;",
+        "Ocirc",
+        "Ocirc;",
+        "Ograve",
+        "Ograve;",
+        "Omega;",
+        "Omicron;",
+        "Oslash",
+        "Oslash;",
+        "Otilde",
+        "Otilde;",
+        "Ouml",
+        "Ouml;",
+        "Phi;",
+        "Pi;",
+        "Prime;",
+        "Psi;",
+        "QUOT",
+        "QUOT;",
+        "REG",
+        "REG;",
+        "Rho;",
+        "Scaron;",
+        "Sigma;",
+        "THORN",
+        "THORN;",
+        "TRADE;",
+        "Tau;",
+        "Theta;",
+        "Uacute",
+        "Uacute;",
+        "Ucirc",
+        "Ucirc;",
+        "Ugrave",
+        "Ugrave;",
+        "Upsilon;",
+        "Uuml",
+        "Uuml;",
+        "Xi;",
+        "Yacute",
+        "Yacute;",
+        "Yuml;",
+        "Zeta;",
+        "aacute",
+        "aacute;",
+        "acirc",
+        "acirc;",
+        "acute",
+        "acute;",
+        "aelig",
+        "aelig;",
+        "agrave",
+        "agrave;",
+        "alefsym;",
+        "alpha;",
+        "amp",
+        "amp;",
+        "and;",
+        "ang;",
+        "apos;",
+        "aring",
+        "aring;",
+        "asymp;",
+        "atilde",
+        "atilde;",
+        "auml",
+        "auml;",
+        "bdquo;",
+        "beta;",
+        "brvbar",
+        "brvbar;",
+        "bull;",
+        "cap;",
+        "ccedil",
+        "ccedil;",
+        "cedil",
+        "cedil;",
+        "cent",
+        "cent;",
+        "chi;",
+        "circ;",
+        "clubs;",
+        "cong;",
+        "copy",
+        "copy;",
+        "crarr;",
+        "cup;",
+        "curren",
+        "curren;",
+        "dArr;",
+        "dagger;",
+        "darr;",
+        "deg",
+        "deg;",
+        "delta;",
+        "diams;",
+        "divide",
+        "divide;",
+        "eacute",
+        "eacute;",
+        "ecirc",
+        "ecirc;",
+        "egrave",
+        "egrave;",
+        "empty;",
+        "emsp;",
+        "ensp;",
+        "epsilon;",
+        "equiv;",
+        "eta;",
+        "eth",
+        "eth;",
+        "euml",
+        "euml;",
+        "euro;",
+        "exist;",
+        "fnof;",
+        "forall;",
+        "frac12",
+        "frac12;",
+        "frac14",
+        "frac14;",
+        "frac34",
+        "frac34;",
+        "frasl;",
+        "gamma;",
+        "ge;",
+        "gt",
+        "gt;",
+        "hArr;",
+        "harr;",
+        "hearts;",
+        "hellip;",
+        "iacute",
+        "iacute;",
+        "icirc",
+        "icirc;",
+        "iexcl",
+        "iexcl;",
+        "igrave",
+        "igrave;",
+        "image;",
+        "infin;",
+        "int;",
+        "iota;",
+        "iquest",
+        "iquest;",
+        "isin;",
+        "iuml",
+        "iuml;",
+        "kappa;",
+        "lArr;",
+        "lambda;",
+        "lang;",
+        "laquo",
+        "laquo;",
+        "larr;",
+        "lceil;",
+        "ldquo;",
+        "le;",
+        "lfloor;",
+        "lowast;",
+        "loz;",
+        "lrm;",
+        "lsaquo;",
+        "lsquo;",
+        "lt",
+        "lt;",
+        "macr",
+        "macr;",
+        "mdash;",
+        "micro",
+        "micro;",
+        "middot",
+        "middot;",
+        "minus;",
+        "mu;",
+        "nabla;",
+        "nbsp",
+        "nbsp;",
+        "ndash;",
+        "ne;",
+        "ni;",
+        "not",
+        "not;",
+        "notin;",
+        "nsub;",
+        "ntilde",
+        "ntilde;",
+        "nu;",
+        "oacute",
+        "oacute;",
+        "ocirc",
+        "ocirc;",
+        "oelig;",
+        "ograve",
+        "ograve;",
+        "oline;",
+        "omega;",
+        "omicron;",
+        "oplus;",
+        "or;",
+        "ordf",
+        "ordf;",
+        "ordm",
+        "ordm;",
+        "oslash",
+        "oslash;",
+        "otilde",
+        "otilde;",
+        "otimes;",
+        "ouml",
+        "ouml;",
+        "para",
+        "para;",
+        "part;",
+        "permil;",
+        "perp;",
+        "phi;",
+        "pi;",
+        "piv;",
+        "plusmn",
+        "plusmn;",
+        "pound",
+        "pound;",
+        "prime;",
+        "prod;",
+        "prop;",
+        "psi;",
+        "quot",
+        "quot;",
+        "rArr;",
+        "radic;",
+        "rang;",
+        "raquo",
+        "raquo;",
+        "rarr;",
+        "rceil;",
+        "rdquo;",
+        "real;",
+        "reg",
+        "reg;",
+        "rfloor;",
+        "rho;",
+        "rlm;",
+        "rsaquo;",
+        "rsquo;",
+        "sbquo;",
+        "scaron;",
+        "sdot;",
+        "sect",
+        "sect;",
+        "shy",
+        "shy;",
+        "sigma;",
+        "sigmaf;",
+        "sim;",
+        "spades;",
+        "sub;",
+        "sube;",
+        "sum;",
+        "sup1",
+        "sup1;",
+        "sup2",
+        "sup2;",
+        "sup3",
+        "sup3;",
+        "sup;",
+        "supe;",
+        "szlig",
+        "szlig;",
+        "tau;",
+        "there4;",
+        "theta;",
+        "thetasym;",
+        "thinsp;",
+        "thorn",
+        "thorn;",
+        "tilde;",
+        "times",
+        "times;",
+        "trade;",
+        "uArr;",
+        "uacute",
+        "uacute;",
+        "uarr;",
+        "ucirc",
+        "ucirc;",
+        "ugrave",
+        "ugrave;",
+        "uml",
+        "uml;",
+        "upsih;",
+        "upsilon;",
+        "uuml",
+        "uuml;",
+        "weierp;",
+        "xi;",
+        "yacute",
+        "yacute;",
+        "yen",
+        "yen;",
+        "yuml",
+        "yuml;",
+        "zeta;",
+        "zwj;",
+        "zwnj;" };
+
+    // Expansion of each entity: VALUES[i] belongs to NAMES[i] (for example
+    // index 0, "AElig", expands to U+00C6). Every expansion in this table
+    // is a one-element char array. The ';'-less and ';'-terminated
+    // spellings of a name carry the same value.
+    final static char[][] VALUES = { {'\u00C6'},
+        {'\u00C6'},
+        {'\u0026'},
+        {'\u0026'},
+        {'\u00C1'},
+        {'\u00C1'},
+        {'\u00C2'},
+        {'\u00C2'},
+        {'\u00C0'},
+        {'\u00C0'},
+        {'\u0391'},
+        {'\u00C5'},
+        {'\u00C5'},
+        {'\u00C3'},
+        {'\u00C3'},
+        {'\u00C4'},
+        {'\u00C4'},
+        {'\u0392'},
+        {'\u00A9'},
+        {'\u00A9'},
+        {'\u00C7'},
+        {'\u00C7'},
+        {'\u03A7'},
+        {'\u2021'},
+        {'\u0394'},
+        {'\u00D0'},
+        {'\u00D0'},
+        {'\u00C9'},
+        {'\u00C9'},
+        {'\u00CA'},
+        {'\u00CA'},
+        {'\u00C8'},
+        {'\u00C8'},
+        {'\u0395'},
+        {'\u0397'},
+        {'\u00CB'},
+        {'\u00CB'},
+        {'\u003E'},
+        {'\u003E'},
+        {'\u0393'},
+        {'\u00CD'},
+        {'\u00CD'},
+        {'\u00CE'},
+        {'\u00CE'},
+        {'\u00CC'},
+        {'\u00CC'},
+        {'\u0399'},
+        {'\u00CF'},
+        {'\u00CF'},
+        {'\u039A'},
+        {'\u003C'},
+        {'\u003C'},
+        {'\u039B'},
+        {'\u039C'},
+        {'\u00D1'},
+        {'\u00D1'},
+        {'\u039D'},
+        {'\u0152'},
+        {'\u00D3'},
+        {'\u00D3'},
+        {'\u00D4'},
+        {'\u00D4'},
+        {'\u00D2'},
+        {'\u00D2'},
+        {'\u03A9'},
+        {'\u039F'},
+        {'\u00D8'},
+        {'\u00D8'},
+        {'\u00D5'},
+        {'\u00D5'},
+        {'\u00D6'},
+        {'\u00D6'},
+        {'\u03A6'},
+        {'\u03A0'},
+        {'\u2033'},
+        {'\u03A8'},
+        {'\u0022'},
+        {'\u0022'},
+        {'\u00AE'},
+        {'\u00AE'},
+        {'\u03A1'},
+        {'\u0160'},
+        {'\u03A3'},
+        {'\u00DE'},
+        {'\u00DE'},
+        {'\u2122'},
+        {'\u03A4'},
+        {'\u0398'},
+        {'\u00DA'},
+        {'\u00DA'},
+        {'\u00DB'},
+        {'\u00DB'},
+        {'\u00D9'},
+        {'\u00D9'},
+        {'\u03A5'},
+        {'\u00DC'},
+        {'\u00DC'},
+        {'\u039E'},
+        {'\u00DD'},
+        {'\u00DD'},
+        {'\u0178'},
+        {'\u0396'},
+        {'\u00E1'},
+        {'\u00E1'},
+        {'\u00E2'},
+        {'\u00E2'},
+        {'\u00B4'},
+        {'\u00B4'},
+        {'\u00E6'},
+        {'\u00E6'},
+        {'\u00E0'},
+        {'\u00E0'},
+        {'\u2135'},
+        {'\u03B1'},
+        {'\u0026'},
+        {'\u0026'},
+        {'\u2227'},
+        {'\u2220'},
+        {'\''},
+        {'\u00E5'},
+        {'\u00E5'},
+        {'\u2248'},
+        {'\u00E3'},
+        {'\u00E3'},
+        {'\u00E4'},
+        {'\u00E4'},
+        {'\u201E'},
+        {'\u03B2'},
+        {'\u00A6'},
+        {'\u00A6'},
+        {'\u2022'},
+        {'\u2229'},
+        {'\u00E7'},
+        {'\u00E7'},
+        {'\u00B8'},
+        {'\u00B8'},
+        {'\u00A2'},
+        {'\u00A2'},
+        {'\u03C7'},
+        {'\u02C6'},
+        {'\u2663'},
+        {'\u2245'},
+        {'\u00A9'},
+        {'\u00A9'},
+        {'\u21B5'},
+        {'\u222A'},
+        {'\u00A4'},
+        {'\u00A4'},
+        {'\u21D3'},
+        {'\u2020'},
+        {'\u2193'},
+        {'\u00B0'},
+        {'\u00B0'},
+        {'\u03B4'},
+        {'\u2666'},
+        {'\u00F7'},
+        {'\u00F7'},
+        {'\u00E9'},
+        {'\u00E9'},
+        {'\u00EA'},
+        {'\u00EA'},
+        {'\u00E8'},
+        {'\u00E8'},
+        {'\u2205'},
+        {'\u2003'},
+        {'\u2002'},
+        {'\u03B5'},
+        {'\u2261'},
+        {'\u03B7'},
+        {'\u00F0'},
+        {'\u00F0'},
+        {'\u00EB'},
+        {'\u00EB'},
+        {'\u20AC'},
+        {'\u2203'},
+        {'\u0192'},
+        {'\u2200'},
+        {'\u00BD'},
+        {'\u00BD'},
+        {'\u00BC'},
+        {'\u00BC'},
+        {'\u00BE'},
+        {'\u00BE'},
+        {'\u2044'},
+        {'\u03B3'},
+        {'\u2265'},
+        {'\u003E'},
+        {'\u003E'},
+        {'\u21D4'},
+        {'\u2194'},
+        {'\u2665'},
+        {'\u2026'},
+        {'\u00ED'},
+        {'\u00ED'},
+        {'\u00EE'},
+        {'\u00EE'},
+        {'\u00A1'},
+        {'\u00A1'},
+        {'\u00EC'},
+        {'\u00EC'},
+        {'\u2111'},
+        {'\u221E'},
+        {'\u222B'},
+        {'\u03B9'},
+        {'\u00BF'},
+        {'\u00BF'},
+        {'\u2208'},
+        {'\u00EF'},
+        {'\u00EF'},
+        {'\u03BA'},
+        {'\u21D0'},
+        {'\u03BB'},
+        {'\u3008'},
+        {'\u00AB'},
+        {'\u00AB'},
+        {'\u2190'},
+        {'\u2308'},
+        {'\u201C'},
+        {'\u2264'},
+        {'\u230A'},
+        {'\u2217'},
+        {'\u25CA'},
+        {'\u200E'},
+        {'\u2039'},
+        {'\u2018'},
+        {'\u003C'},
+        {'\u003C'},
+        {'\u00AF'},
+        {'\u00AF'},
+        {'\u2014'},
+        {'\u00B5'},
+        {'\u00B5'},
+        {'\u00B7'},
+        {'\u00B7'},
+        {'\u2212'},
+        {'\u03BC'},
+        {'\u2207'},
+        {'\u00A0'},
+        {'\u00A0'},
+        {'\u2013'},
+        {'\u2260'},
+        {'\u220B'},
+        {'\u00AC'},
+        {'\u00AC'},
+        {'\u2209'},
+        {'\u2284'},
+        {'\u00F1'},
+        {'\u00F1'},
+        {'\u03BD'},
+        {'\u00F3'},
+        {'\u00F3'},
+        {'\u00F4'},
+        {'\u00F4'},
+        {'\u0153'},
+        {'\u00F2'},
+        {'\u00F2'},
+        {'\u203E'},
+        {'\u03C9'},
+        {'\u03BF'},
+        {'\u2295'},
+        {'\u2228'},
+        {'\u00AA'},
+        {'\u00AA'},
+        {'\u00BA'},
+        {'\u00BA'},
+        {'\u00F8'},
+        {'\u00F8'},
+        {'\u00F5'},
+        {'\u00F5'},
+        {'\u2297'},
+        {'\u00F6'},
+        {'\u00F6'},
+        {'\u00B6'},
+        {'\u00B6'},
+        {'\u2202'},
+        {'\u2030'},
+        {'\u22A5'},
+        {'\u03C6'},
+        {'\u03C0'},
+        {'\u03D6'},
+        {'\u00B1'},
+        {'\u00B1'},
+        {'\u00A3'},
+        {'\u00A3'},
+        {'\u2032'},
+        {'\u220F'},
+        {'\u221D'},
+        {'\u03C8'},
+        {'\u0022'},
+        {'\u0022'},
+        {'\u21D2'},
+        {'\u221A'},
+        {'\u3009'},
+        {'\u00BB'},
+        {'\u00BB'},
+        {'\u2192'},
+        {'\u2309'},
+        {'\u201D'},
+        {'\u211C'},
+        {'\u00AE'},
+        {'\u00AE'},
+        {'\u230B'},
+        {'\u03C1'},
+        {'\u200F'},
+        {'\u203A'},
+        {'\u2019'},
+        {'\u201A'},
+        {'\u0161'},
+        {'\u22C5'},
+        {'\u00A7'},
+        {'\u00A7'},
+        {'\u00AD'},
+        {'\u00AD'},
+        {'\u03C3'},
+        {'\u03C2'},
+        {'\u223C'},
+        {'\u2660'},
+        {'\u2282'},
+        {'\u2286'},
+        {'\u2211'},
+        {'\u00B9'},
+        {'\u00B9'},
+        {'\u00B2'},
+        {'\u00B2'},
+        {'\u00B3'},
+        {'\u00B3'},
+        {'\u2283'},
+        {'\u2287'},
+        {'\u00DF'},
+        {'\u00DF'},
+        {'\u03C4'},
+        {'\u2234'},
+        {'\u03B8'},
+        {'\u03D1'},
+        {'\u2009'},
+        {'\u00FE'},
+        {'\u00FE'},
+        {'\u02DC'},
+        {'\u00D7'},
+        {'\u00D7'},
+        {'\u2122'},
+        {'\u21D1'},
+        {'\u00FA'},
+        {'\u00FA'},
+        {'\u2191'},
+        {'\u00FB'},
+        {'\u00FB'},
+        {'\u00F9'},
+        {'\u00F9'},
+        {'\u00A8'},
+        {'\u00A8'},
+        {'\u03D2'},
+        {'\u03C5'},
+        {'\u00FC'},
+        {'\u00FC'},
+        {'\u2118'},
+        {'\u03BE'},
+        {'\u00FD'},
+        {'\u00FD'},
+        {'\u00A5'},
+        {'\u00A5'},
+        {'\u00FF'},
+        {'\u00FF'},
+        {'\u03B6'},
+        {'\u200D'},
+        {'\u200C'} };
+    
+    // 32 entries matching the Windows-1252 assignments for bytes 0x80
+    // through 0x9F, in order (index 0 is 0x80 -> U+20AC, index 31 is
+    // 0x9F -> U+0178). Positions that Windows-1252 leaves unassigned hold
+    // U+FFFD. NOTE(review): presumably indexed by (code point - 0x80) when
+    // a numeric character reference falls into that range -- confirm
+    // against the consumer.
+    final static char[][] WINDOWS_1252 = {{'\u20AC'},
+        {'\uFFFD'},
+        {'\u201A'},
+        {'\u0192'},
+        {'\u201E'},
+        {'\u2026'},
+        {'\u2020'},
+        {'\u2021'},
+        {'\u02C6'},
+        {'\u2030'},
+        {'\u0160'},
+        {'\u2039'},
+        {'\u0152'},
+        {'\uFFFD'},
+        {'\u017D'},
+        {'\uFFFD'},
+        {'\uFFFD'},
+        {'\u2018'},
+        {'\u2019'},
+        {'\u201C'},
+        {'\u201D'},
+        {'\u2022'},
+        {'\u2013'},
+        {'\u2014'},
+        {'\u02DC'},
+        {'\u2122'},
+        {'\u0161'},
+        {'\u203A'},
+        {'\u0153'},
+        {'\uFFFD'},
+        {'\u017E'},
+        {'\u0178'}};
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/HtmlInputStreamReader.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/HtmlInputStreamReader.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/HtmlInputStreamReader.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/HtmlInputStreamReader.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+
+
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+/**
+ * Be very careful with this class. It is not a general-purpose subclass of
+ * <code>Reader</code>. Instead, it is the minimal implementation that does
+ * what <code>Tokenizer</code> needs while being an instance of
+ * <code>Reader</code>: only {@link #read(char[])} and {@link #close()} are
+ * implemented, and the remaining <code>read</code> overloads throw
+ * <code>UnsupportedOperationException</code>.
+ * 
+ * <p>
+ * The four-argument constructor puts the object into a sniffing state in
+ * which {@link #readByte()} hands out at most <code>SNIFFING_LIMIT</code>
+ * raw bytes to the BOM and meta sniffers. Once a decoder has been chosen the
+ * sniffing state ends for good and the buffered bytes are replayed through
+ * the decoder by {@link #read(char[])}.
+ * 
+ * <p>
+ * The only reason why this is a public class is that it needs to be visible to
+ * test code in another package.
+ * 
+ * @version $Id: HtmlInputStreamReader.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public final class HtmlInputStreamReader extends Reader implements
+        ByteReadable, Locator {
+
+    /**
+     * Maximum number of bytes exposed through {@link #readByte()}, and the
+     * fill level of the byte buffer until the charset boundary is passed.
+     */
+    private static final int SNIFFING_LIMIT = 512;
+
+    private final InputStream inputStream;
+
+    private final ErrorHandler errorHandler;
+
+    private final Locator locator;
+
+    private final Tokenizer tokenizer;
+
+    private CharsetDecoder decoder = null;
+
+    /** True while bytes are being served to the sniffers via readByte(). */
+    private boolean sniffing = true;
+
+    /** Number of valid bytes placed in byteArray during sniffing. */
+    private int limit = 0;
+
+    /** Index of the next byte readByte() will return. */
+    private int position = 0;
+
+    /** Bytes consumed so far by the decoder, including skipped bad bytes. */
+    private int bytesRead = 0;
+
+    /** Set once the underlying stream has reported end of input. */
+    private boolean eofSeen = false;
+
+    /** True when the next read(char[]) has to refill the byte buffer first. */
+    private boolean shouldReadBytes = false;
+
+    /**
+     * Set once SNIFFING_LIMIT bytes have been seen by read(char[]). From then
+     * on the whole byte array is used for refills and the tokenizer is not
+     * notified again.
+     */
+    private boolean charsetBoundaryPassed = false;
+
+    // Length must be >= SNIFFING_LIMIT
+    private final byte[] byteArray = new byte[4096];
+
+    private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
+
+    /** Deferred request to tell the tokenizer about the meta boundary. */
+    private boolean needToNotifyTokenizer = false;
+
+    /** True once the final decode has been started; reads then only flush. */
+    private boolean flushing = false;
+
+    /**
+     * True when a bad byte sequence was reported while the caller's buffer
+     * was already full, so that the replacement character still has to be
+     * delivered at the start of the next read(char[]).
+     */
+    private boolean hasPendingReplacementCharacter = false;
+
+    private int line = -1;
+
+    private int col = -1;
+
+    /** Offset in the current char buffer up to which line/col are counted. */
+    private int lineColPos;
+
+    /**
+     * Creates a reader that determines the encoding itself: first by BOM
+     * sniffing, then by meta sniffing, and finally by falling back to
+     * Windows-1252 (reporting an error and, if a tokenizer is present,
+     * calling its <code>noEncodingDeclared()</code>).
+     * 
+     * @param inputStream the byte stream to decode
+     * @param errorHandler receiver of encoding errors; may be
+     *        <code>null</code>
+     * @param locator source of the base line/column for error positions; may
+     *        be <code>null</code>
+     * @param tokenizer tokenizer to notify; may be <code>null</code>
+     * @throws IOException if reading fails or the error handler vetoes
+     * @throws SAXException declared for the sniffers
+     */
+    public HtmlInputStreamReader(InputStream inputStream,
+            ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer)
+            throws SAXException, IOException {
+        this.inputStream = inputStream;
+        this.errorHandler = errorHandler;
+        this.locator = locator;
+        this.tokenizer = tokenizer;
+        this.sniffing = true;
+        this.decoder = (new BomSniffer(this)).sniff();
+        if (this.decoder == null) {
+            // Rewind so that the meta sniffer sees the bytes from the start.
+            position = 0;
+            this.decoder = (new MetaSniffer(this, errorHandler, this)).sniff();
+            sniffing = false;
+            // TODO chardet
+            if (this.decoder == null) {
+                if (tokenizer != null) {
+                    tokenizer.noEncodingDeclared();
+                }
+                err("Could not determine the character encoding of the document. Using \u201CWindows-1252\u201D.");
+                this.decoder = Charset.forName("Windows-1252").newDecoder();
+            }
+        }
+        sniffing = false;
+        position = 0;
+        bytesRead = 0;
+        // Expose everything buffered during sniffing to the decoder.
+        byteBuffer.position(position);
+        byteBuffer.limit(limit);
+        initDecoder();
+    }
+
+    /**
+     * Finishes decoder setup: a decoder for ISO-8859-1 is replaced with one
+     * for Windows-1252, and malformed or unmappable input is set to be
+     * reported rather than silently replaced, so that read(char[]) can emit
+     * error messages.
+     */
+    private void initDecoder() {
+        if ("ISO-8859-1".equals(this.decoder.charset().name())) {
+            this.decoder = Charset.forName("Windows-1252").newDecoder();
+        }
+        this.decoder.onMalformedInput(CodingErrorAction.REPORT);
+        this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+    }
+
+    /**
+     * Creates a reader that uses the supplied decoder without any sniffing.
+     * 
+     * @param inputStream the byte stream to decode
+     * @param errorHandler receiver of encoding errors; may be
+     *        <code>null</code>
+     * @param locator source of the base line/column for error positions; may
+     *        be <code>null</code>
+     * @param tokenizer tokenizer to notify; may be <code>null</code>
+     * @param decoder the decoder to use (subject to the ISO-8859-1 to
+     *        Windows-1252 substitution)
+     * @throws IOException declared for symmetry with the sniffing constructor
+     * @throws SAXException declared for symmetry with the sniffing constructor
+     */
+    public HtmlInputStreamReader(InputStream inputStream,
+            ErrorHandler errorHandler, Locator locator, Tokenizer tokenizer,
+            CharsetDecoder decoder) throws SAXException, IOException {
+        this.inputStream = inputStream;
+        this.errorHandler = errorHandler;
+        this.locator = locator;
+        this.tokenizer = tokenizer;
+        this.decoder = decoder;
+        this.sniffing = false;
+        position = 0;
+        bytesRead = 0;
+        byteBuffer.position(0);
+        byteBuffer.limit(0);
+        // Nothing has been buffered yet, so the first read must fetch bytes.
+        shouldReadBytes = true;
+        initDecoder();
+    }
+
+    /**
+     * Closes the underlying input stream.
+     */
+    @Override
+    public void close() throws IOException {
+        inputStream.close();
+    }
+
+    /**
+     * Decodes bytes into <code>charArray</code>, starting at index 0.
+     * Malformed and unmappable byte sequences are reported to the error
+     * handler and replaced with U+FFFD.
+     * 
+     * @param charArray destination; must hold at least two chars
+     * @return the number of chars stored (possibly zero), or -1 once the
+     *         input is exhausted and the decoder has been flushed
+     * @throws IllegalStateException if called while still sniffing
+     * @throws IOException if reading fails or the error handler throws
+     */
+    @Override
+    public int read(char[] charArray) throws IOException {
+        lineColPos = 0;
+        if (sniffing) {
+            throw new IllegalStateException(
+                    "read() called when in the sniffing state.");
+        }
+        assert charArray.length >= 2;
+        if (needToNotifyTokenizer) {
+            if (tokenizer != null) {
+                tokenizer.notifyAboutMetaBoundary();
+            }
+            needToNotifyTokenizer = false;
+        }
+        CharBuffer charBuffer = CharBuffer.wrap(charArray);
+        charBuffer.limit(charArray.length);
+        charBuffer.position(0);
+        if (hasPendingReplacementCharacter && charBuffer.hasRemaining()) {
+            // Deliver the replacement that did not fit into the previous
+            // (full) buffer.
+            charBuffer.put('\uFFFD');
+            hasPendingReplacementCharacter = false;
+        }
+        if (flushing) {
+            decoder.flush(charBuffer);
+            // return -1 if zero
+            int cPos = charBuffer.position();
+            return cPos == 0 ? -1 : cPos;
+        }
+        if (shouldReadBytes) {
+            refillByteBuffer();
+        }
+        boolean finalDecode = false;
+        for (;;) {
+            int oldBytePos = byteBuffer.position();
+            CoderResult cr = decoder.decode(byteBuffer, charBuffer,
+                    finalDecode);
+            bytesRead += byteBuffer.position() - oldBytePos;
+            if (cr.isOverflow()) {
+                // Decoder will remember surrogates
+                return charBuffer.position();
+            } else if (cr.isUnderflow()) {
+                int remaining = byteBuffer.remaining();
+                if (!charsetBoundaryPassed
+                        && bytesRead + remaining >= SNIFFING_LIMIT) {
+                    // Notify exactly once and let later refills use the
+                    // whole byte array.
+                    needToNotifyTokenizer = true;
+                    charsetBoundaryPassed = true;
+                }
+
+                // XXX what happens if the entire byte buffer consists of
+                // a pathologically long malformed sequence?
+
+                // If the buffer was not fully consumed, there may be an
+                // incomplete byte sequence that needs to seed the next
+                // buffer.
+                if (remaining > 0) {
+                    System.arraycopy(byteArray, byteBuffer.position(),
+                            byteArray, 0, remaining);
+                }
+                byteBuffer.position(0);
+                byteBuffer.limit(remaining);
+                if (flushing) {
+                    // The final decode was successful. Not sure if this
+                    // ever happens.
+                    // Let's get out in any case.
+                    int cPos = charBuffer.position();
+                    return cPos == 0 ? -1 : cPos;
+                } else if (eofSeen) {
+                    // If there's something left, it isn't something that
+                    // would be consumed in the middle of the stream. Rerun
+                    // the loop once in the final mode.
+                    shouldReadBytes = false;
+                    finalDecode = true;
+                    flushing = true;
+                    continue;
+                } else {
+                    // The usual stuff. Want more bytes next time.
+                    shouldReadBytes = true;
+                    return charBuffer.position();
+                }
+            } else {
+                // The result is in error. No need to test.
+                StringBuilder sb = new StringBuilder();
+                for (int i = 0; i < cr.length(); i++) {
+                    if (i > 0) {
+                        sb.append(", ");
+                    }
+                    sb.append('\u201C');
+                    // Skip the offending byte while quoting it in hex.
+                    sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
+                    bytesRead++;
+                    sb.append('\u201D');
+                }
+                if (charBuffer.hasRemaining()) {
+                    charBuffer.put('\uFFFD');
+                } else {
+                    // The decoder may report bad input while the output is
+                    // already full; putting now would overflow the buffer.
+                    hasPendingReplacementCharacter = true;
+                }
+                calculateLineAndCol(charBuffer);
+                if (cr.isMalformed()) {
+                    err("Malformed byte sequence: " + sb + ".");
+                } else if (cr.isUnmappable()) {
+                    err("Unmappable byte sequence: " + sb + ".");
+                } else {
+                    throw new RuntimeException(
+                            "CoderResult was none of overflow, underflow, malformed or unmappable.");
+                }
+                if (finalDecode || hasPendingReplacementCharacter) {
+                    // Either these were the last bytes of input or the
+                    // caller's buffer is full. Return without relooping.
+                    return charBuffer.position();
+                }
+            }
+        }
+    }
+
+    /**
+     * Appends bytes from the input stream after the ones still held in the
+     * byte buffer, or records end of input and closes the stream.
+     */
+    private void refillByteBuffer() throws IOException {
+        int oldLimit = byteBuffer.limit();
+        int fillTarget = charsetBoundaryPassed ? byteArray.length
+                : SNIFFING_LIMIT;
+        int num = inputStream.read(byteArray, oldLimit, fillTarget - oldLimit);
+        if (num == -1) {
+            eofSeen = true;
+            inputStream.close();
+        } else {
+            byteBuffer.position(0);
+            byteBuffer.limit(oldLimit + num);
+        }
+        shouldReadBytes = false;
+    }
+
+    /**
+     * Updates <code>line</code> and <code>col</code> for error reporting:
+     * starts from the wrapped locator's current position and advances over
+     * the chars of <code>charBuffer</code> between <code>lineColPos</code>
+     * and the buffer's position. CR, LF and CRLF each count as one line
+     * break. Does nothing when there is no locator.
+     */
+    private void calculateLineAndCol(CharBuffer charBuffer) {
+        if (locator != null) {
+            line = locator.getLineNumber();
+            col = locator.getColumnNumber();
+            char[] charArray = charBuffer.array();
+            boolean prevWasCR = false;
+            int i;
+            for (i = lineColPos; i < charBuffer.position(); i++) {
+                switch (charArray[i]) {
+                    case '\n': // LF
+                        if (!prevWasCR) {
+                            line++;
+                            col = 0;
+                        }
+                        prevWasCR = false;
+                        break;
+                    case '\r': // CR
+                        line++;
+                        col = 0;
+                        prevWasCR = true;
+                        break;
+                    default:
+                        col++;
+                        prevWasCR = false;
+                        break;
+                }
+            }
+            lineColPos = i;
+        }
+    }
+
+    /**
+     * Returns the next raw byte for the sniffers, buffering it so that it can
+     * be decoded later.
+     * 
+     * @return the byte as an unsigned value, or -1 at end of input or once
+     *         <code>SNIFFING_LIMIT</code> bytes have been handed out
+     * @throws IllegalStateException if called outside the sniffing state
+     * @throws IOException if reading from the stream fails
+     */
+    public int readByte() throws IOException {
+        if (!sniffing) {
+            throw new IllegalStateException(
+                    "readByte() called when not in the sniffing state.");
+        }
+        if (position == SNIFFING_LIMIT) {
+            return -1;
+        } else if (position < limit) {
+            return byteArray[position++] & 0xFF;
+        } else {
+            int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
+            if (num == -1) {
+                return -1;
+            } else {
+                limit += num;
+                return byteArray[position++] & 0xFF;
+            }
+        }
+    }
+
+    /**
+     * Ad-hoc experiment, unrelated to parsing: prints the result and buffer
+     * state of a UTF-8 decoder that is given a four-byte sequence with a
+     * one-char output buffer, followed by a buffer that starts with
+     * continuation bytes.
+     */
+    public static void main(String[] args) {
+        CharsetDecoder dec = Charset.forName("UTF-8").newDecoder();
+        dec.onMalformedInput(CodingErrorAction.REPORT);
+        dec.onUnmappableCharacter(CodingErrorAction.REPORT);
+        byte[] bytes = { (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 };
+        byte[] bytes2 = { (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 };
+        ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
+        ByteBuffer byteBuf2 = ByteBuffer.wrap(bytes2);
+        char[] chars = new char[1];
+        CharBuffer charBuf = CharBuffer.wrap(chars);
+
+        CoderResult cr = dec.decode(byteBuf, charBuf, false);
+        System.out.println(cr);
+        System.out.println(byteBuf);
+        // byteBuf.get();
+        cr = dec.decode(byteBuf2, charBuf, false);
+        System.out.println(cr);
+        System.out.println(byteBuf2);
+
+    }
+
+    /**
+     * @return the column computed for the most recent decoding error, or -1
+     *         if there is no locator or no error has been located yet
+     */
+    public int getColumnNumber() {
+        if (locator != null) {
+            return col;
+        }
+        return -1;
+    }
+
+    /**
+     * @return the line computed for the most recent decoding error, or -1 if
+     *         there is no locator or no error has been located yet
+     */
+    public int getLineNumber() {
+        if (locator != null) {
+            return line;
+        }
+        return -1;
+    }
+
+    /**
+     * @return the wrapped locator's public id, or <code>null</code> if there
+     *         is no locator
+     */
+    public String getPublicId() {
+        if (locator != null) {
+            return locator.getPublicId();
+        }
+        return null;
+    }
+
+    /**
+     * @return the wrapped locator's system id, or <code>null</code> if there
+     *         is no locator
+     */
+    public String getSystemId() {
+        if (locator != null) {
+            return locator.getSystemId();
+        }
+        return null;
+    }
+
+    /**
+     * Reports an error to the error handler, if any, using this reader as the
+     * locator.
+     * 
+     * @param message the error message
+     * @throws IOException wrapping the <code>SAXException</code> if the
+     *         handler throws one
+     */
+    private void err(String message) throws IOException {
+        // TODO remove wrapping when changing read() to take a CharBuffer
+        try {
+            if (errorHandler != null) {
+                SAXParseException spe = new SAXParseException(message, this);
+                errorHandler.error(spe);
+            }
+        } catch (SAXException e) {
+            throw (IOException) new IOException(e.getMessage()).initCause(e);
+        }
+    }
+
+    /**
+     * Reports a warning to the error handler, if any, using this reader as
+     * the locator.
+     * 
+     * @param message the warning message
+     * @throws IOException wrapping the <code>SAXException</code> if the
+     *         handler throws one
+     */
+    private void warn(String message) throws IOException {
+        try {
+            if (errorHandler != null) {
+                SAXParseException spe = new SAXParseException(message, this);
+                errorHandler.warning(spe);
+            }
+        } catch (SAXException e) {
+            throw (IOException) new IOException(e.getMessage()).initCause(e);
+        }
+    }
+
+    /**
+     * @return the charset of the decoder in use
+     */
+    public Charset getCharset() {
+        return decoder.charset();
+    }
+
+    /**
+     * Not supported.
+     * 
+     * @see java.io.Reader#read()
+     */
+    @Override
+    public int read() throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Not supported.
+     * 
+     * @see java.io.Reader#read(char[], int, int)
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Not supported.
+     * 
+     * @see java.io.Reader#read(java.nio.CharBuffer)
+     */
+    @Override
+    public int read(CharBuffer target) throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+}



Mime
View raw message