Return-Path: Delivered-To: apmail-incubator-connectors-commits-archive@minotaur.apache.org Received: (qmail 3651 invoked from network); 5 Mar 2010 10:51:07 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 5 Mar 2010 10:51:07 -0000 Received: (qmail 88131 invoked by uid 500); 5 Mar 2010 10:50:54 -0000 Delivered-To: apmail-incubator-connectors-commits-archive@incubator.apache.org Received: (qmail 88097 invoked by uid 500); 5 Mar 2010 10:50:54 -0000 Mailing-List: contact connectors-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: connectors-dev@incubator.apache.org Delivered-To: mailing list connectors-commits@incubator.apache.org Received: (qmail 88090 invoked by uid 99); 5 Mar 2010 10:50:54 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 05 Mar 2010 10:50:54 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 05 Mar 2010 10:50:53 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 79A2223888E7; Fri, 5 Mar 2010 10:50:32 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r919366 - in /incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl: Constants.java XMLDocumentScannerImpl.java XMLEntityManager.java XMLScanner.java Date: Fri, 05 Mar 2010 10:50:32 -0000 To: connectors-commits@incubator.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100305105032.79A2223888E7@eris.apache.org> Author: kwright Date: Fri Mar 5 10:50:32 2010 New Revision: 919366 URL: http://svn.apache.org/viewvc?rev=919366&view=rev Log: Add required patches to xerces-2 java, for LCF. 
These changes permit acceptance of bad UTF-8 sequences, and fix some of the more egregious problems with allowing the parse to continue after a fatal error. Modified: incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/Constants.java incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLEntityManager.java incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLScanner.java Modified: incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/Constants.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/Constants.java?rev=919366&r1=919365&r2=919366&view=diff ============================================================================== --- incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/Constants.java (original) +++ incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/Constants.java Fri Mar 5 10:50:32 2010 @@ -219,6 +219,9 @@ /** Warn on undeclared element feature ("validation/warn-on-undeclared-elemdef"). */ public static final String WARN_ON_UNDECLARED_ELEMDEF_FEATURE = "validation/warn-on-undeclared-elemdef"; + + /** Ignore misencoded characters feature */ + public static final String IGNORE_BADLY_ENCODED_CHARS = "ignore-badly-encoded-chars"; /** Warn on duplicate entity declaration feature ("warn-on-duplicate-entitydef"). 
*/ public static final String WARN_ON_DUPLICATE_ENTITYDEF_FEATURE = "warn-on-duplicate-entitydef"; Modified: incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java?rev=919366&r1=919365&r2=919366&view=diff ============================================================================== --- incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java (original) +++ incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLDocumentScannerImpl.java Fri Mar 5 10:50:32 2010 @@ -783,6 +783,8 @@ else { reportFatalError("MarkupNotRecognizedInProlog", null); + // Don't loop forever! + fEntityScanner.scanChar(); } } else if (isValidNameStartChar(fEntityScanner.peekChar())) { @@ -802,6 +804,8 @@ else { reportFatalError("MarkupNotRecognizedInProlog", null); + // Don't loop forever! + fEntityScanner.scanChar(); } break; } @@ -872,6 +876,8 @@ } case SCANNER_STATE_REFERENCE: { reportFatalError("ReferenceIllegalInProlog", null); + // Don't loop forever! + fEntityScanner.scanChar(); } } } while (complete || again); @@ -1277,6 +1283,8 @@ else { reportFatalError("MarkupNotRecognizedInMisc", null); + // Skip forward one character, otherwise we loop forever. 
+ fEntityScanner.scanChar(); } break; } Modified: incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLEntityManager.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLEntityManager.java?rev=919366&r1=919365&r2=919366&view=diff ============================================================================== --- incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLEntityManager.java (original) +++ incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLEntityManager.java Fri Mar 5 10:50:32 2010 @@ -132,6 +132,10 @@ protected static final String PARSER_SETTINGS = Constants.XERCES_FEATURE_PREFIX + Constants.PARSER_SETTINGS; + /** Feature identifier: ignore badly encoded characters */ + protected static final String IGNORE_BADLY_ENCODED_CHARS = + Constants.XERCES_FEATURE_PREFIX + Constants.IGNORE_BADLY_ENCODED_CHARS; + // property identifiers /** Property identifier: symbol table. */ @@ -167,7 +171,8 @@ EXTERNAL_PARAMETER_ENTITIES, ALLOW_JAVA_ENCODINGS, WARN_ON_DUPLICATE_ENTITYDEF, - STANDARD_URI_CONFORMANT + STANDARD_URI_CONFORMANT, + IGNORE_BADLY_ENCODED_CHARS }; /** Feature defaults. 
*/ @@ -177,6 +182,7 @@ Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE, Boolean.FALSE }; @@ -262,6 +268,12 @@ */ protected boolean fStrictURI; + /** + * allow badly encoded characters (skip them) + * http://apache.org/xml/features/ignore-badly-encoded-chars + */ + protected boolean fAllowBadlyEncodedChars; + // properties /** @@ -1310,6 +1322,13 @@ fStrictURI = false; } + try { + fAllowBadlyEncodedChars = componentManager.getFeature(IGNORE_BADLY_ENCODED_CHARS); + } + catch (XMLConfigurationException e) { + fAllowBadlyEncodedChars = false; + } + // xerces properties fSymbolTable = (SymbolTable)componentManager.getProperty(SYMBOL_TABLE); fErrorReporter = (XMLErrorReporter)componentManager.getProperty(ERROR_REPORTER); @@ -2082,6 +2101,33 @@ protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian) throws IOException { + Reader internalReader = createInternalReader(inputStream, encoding, isBigEndian); + if (fAllowBadlyEncodedChars) + { + // Wrap the reader so that bad characters are ignored rather than causing aborts + return new LaxReader(internalReader); + } + return internalReader; + } + + /** + * Creates a reader capable of reading the given input stream in + * the specified encoding. + * + * @param inputStream The input stream. + * @param encoding The encoding name that the input stream is + * encoded using. If the user has specified that + * Java encoding names are allowed, then the + * encoding name may be a Java encoding name; + * otherwise, it is an ianaEncoding name. + * @param isBigEndian For encodings (like UCS-4), whose names cannot + * specify a byte order, this tells whether the order is bigEndian. null means + * unknown or not relevant. + * + * @return Returns a reader.
+ */ + protected Reader createInternalReader(InputStream inputStream, String encoding, Boolean isBigEndian) + throws IOException { // if the encoding is UTF-8 use the optimized UTF-8 reader if (encoding == "UTF-8" || encoding == null) { if (DEBUG_ENCODINGS) { @@ -3025,6 +3071,9 @@ return -1; } if (fOffset == fData.length) { + if (fCurrentEntity.mayReadChunks) { + return fInputStream.read(); + } byte[] newData = new byte[fOffset << 1]; System.arraycopy(fData, 0, newData, 0, fOffset); fData = newData; @@ -3138,4 +3187,105 @@ } } // end of RewindableInputStream class + protected static class LaxReader extends Reader + { + protected Reader internalReader; + + public LaxReader(Reader internalReader) + { + this.internalReader = internalReader; + } + + public int read() + throws IOException + { + // Since we need to be able to skip ahead at the point of error, and not drop huge amounts on the floor, + // all read operations for this class are channeled through the single-character operation. This is less + // efficient, but hopefully not terribly so. + try + { + return internalReader.read(); + } + catch (org.apache.xerces.impl.io.MalformedByteSequenceException e) + { + // When this fails, it means we detected a bad character. + // However, the bad character has already been pulled off the stream, so we are free to stuff in a "?" and + // just keep going. 
+ return (int)'?'; + } + } + + public int read(char[] cbuf) + throws IOException + { + return read(cbuf,0,cbuf.length); + } + + public int read(char[] cbuf, + int off, + int len) + throws IOException + { + int amtRead = 0; + while (amtRead < len) + { + int cval = read(); + if (cval == -1) + { + if (amtRead == 0) + return -1; + else + return amtRead; + } + cbuf[off++] = (char)cval; + amtRead++; + } + return amtRead; + } + + public long skip(long n) + throws IOException + { + long skipped = 0; + while (skipped < n) + { + int cval = read(); + if (cval == -1) + break; + skipped++; + } + return skipped; + } + + public boolean ready() + throws IOException + { + return internalReader.ready(); + } + + public boolean markSupported() + { + return internalReader.markSupported(); + } + + public void mark(int readAheadLimit) + throws IOException + { + internalReader.mark(readAheadLimit); + } + + public void reset() + throws IOException + { + internalReader.reset(); + } + + public void close() + throws IOException + { + internalReader.close(); + } + } + + } // class XMLEntityManager Modified: incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLScanner.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLScanner.java?rev=919366&r1=919365&r2=919366&view=diff ============================================================================== --- incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLScanner.java (original) +++ incubator/lcf/trunk/upstream/xerces2-j/src/org/apache/xerces/impl/XMLScanner.java Fri Mar 5 10:50:32 2010 @@ -823,6 +823,7 @@ String entityName = fEntityScanner.scanName(); if (entityName == null) { reportFatalError("NameRequiredInReference", null); + entityName = "unknown"; } else if (entityDepth == fEntityDepth) { fStringBuffer2.append(entityName); @@ -1027,6 +1028,14 @@ if (XMLChar.isMarkup(c) || c == ']') { fStringBuffer.append((char)fEntityScanner.scanChar()); } + else if 
(XMLChar.isHighSurrogate(c)) { + scanSurrogates(fStringBuffer); + } + else if (isInvalidLiteral(c)) { + reportFatalError("InvalidCharInSystemID", + new Object[] { Integer.toHexString(c) }); + fEntityScanner.scanChar(); + } } while (fEntityScanner.scanLiteral(quote, ident) != quote); fStringBuffer.append(ident); ident = fStringBuffer;