abdera-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jmsn...@apache.org
Subject svn commit: r559700 - in /incubator/abdera/java/trunk: core/src/main/java/org/apache/abdera/parser/ core/src/main/java/org/apache/abdera/util/ parser/src/main/java/org/apache/abdera/parser/stax/ parser/src/main/java/org/apache/abdera/parser/stax/util/
Date Thu, 26 Jul 2007 06:08:33 GMT
Author: jmsnell
Date: Wed Jul 25 23:08:32 2007
New Revision: 559700

URL: http://svn.apache.org/viewvc?view=rev&rev=559700
Log:
The XML parser tends to fail when it encounters characters that are not valid
within XML documents. Go figure.  Unfortunately, invalid characters occur quite
frequently in syndication feeds.  This change provides the simple option of
filtering bad characters out at the Reader level.  Filtering is disabled by
default and is enabled using a ParserOptions property.  The characters to filter
are determined automatically from the XML version the document declares; if no
version can be detected, XML 1.0 is assumed.

Added:
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/XmlRestrictedCharFilter.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlRestrictedCharFilter.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionInputStream.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionReader.java
Modified:
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/PeekAheadInputStream.java

Modified: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java?view=diff&rev=559700&r1=559699&r2=559700
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
(original)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
Wed Jul 25 23:08:32 2007
@@ -87,4 +87,29 @@
    * and attribute values unless there is an in-scope xml:space="preserve".
    */
   void setMustPreserveWhitespace(boolean preserve);
+  
+  /**
+   * If true, the parser will attempt to silently filter out invalid XML
+   * characters appearing within the XML document.
+   */
+  boolean getFilterRestrictedCharacters();
+  
+  /**
+   * If true, the parser will attempt to silently filter out invalid XML
+   * characters appearing within the XML document.
+   */
+  void setFilterRestrictedCharacters(boolean filter);
+  
+  /**
+   * If getFilterRestrictedCharacters is true, restricted characters will
+   * be replaced with this character; a value of 0 means they are removed
+   */
+  char getFilterRestrictedCharacterReplacement();
+  
+  /**
+   * If getFilterRestrictedCharacters is true, restricted characters will
+   * be replaced with this character; a value of 0 means they are removed
+   */
+  void setFilterRestrictedCharacterReplacement(char replacement);
+  
 }

Modified: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java?view=diff&rev=559700&r1=559699&r2=559700
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
(original)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
Wed Jul 25 23:08:32 2007
@@ -32,6 +32,8 @@
   protected ParseFilter parseFilter = null;
   protected boolean detect = false;
   protected boolean preserve = true;
+  protected boolean filterreserved = false;
+  protected char replacement = 0;
 
   protected abstract void initFactory();
   protected abstract void checkFactory(Factory factory);
@@ -90,4 +92,19 @@
     this.preserve = preserve;
   }
   
+  public boolean getFilterRestrictedCharacters() {
+    return filterreserved;
+  }
+  
+  public void setFilterRestrictedCharacters(boolean filter) {
+    this.filterreserved = filter;
+  }
+  
+  public char getFilterRestrictedCharacterReplacement() {
+    return replacement;
+  }
+  
+  public void setFilterRestrictedCharacterReplacement(char replacement) {
+    this.replacement = replacement;
+  }
 }

Added: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/XmlRestrictedCharFilter.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/XmlRestrictedCharFilter.java?view=auto&rev=559700
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/XmlRestrictedCharFilter.java
(added)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/XmlRestrictedCharFilter.java
Wed Jul 25 23:08:32 2007
@@ -0,0 +1,155 @@
+package org.apache.abdera.util;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.abdera.i18n.ChainableBitSet;
+
+/**
+ * A reader implementation that filters out characters that are not allowed
+ * in XML 1.0 or XML 1.1 documents.  The default mode is to assume XML 1.0.
+ * 
+ * By default, invalid characters are simply removed from the stream.  
+ * Alternatively, a replacement character can be provided so long as it
+ * is a valid XML character itself.
+ */
+public class XmlRestrictedCharFilter 
+  extends FilterReader {
+
+  /**
+   * The mode determines which set of restrictions to apply depending 
+   * on the XML version being parsed
+   */
+  public enum Mode { XML10, XML11 };
+  
+  private final ChainableBitSet set;
+  private final char replacement;
+  
+  protected XmlRestrictedCharFilter(InputStream in) {
+    this(new InputStreamReader(in));
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    String charset) 
+      throws UnsupportedEncodingException {
+    this(new InputStreamReader(in,charset));
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    Mode mode) {
+      this(new InputStreamReader(in),mode);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    String charset, 
+    Mode mode) 
+      throws UnsupportedEncodingException {
+    this(new InputStreamReader(in,charset),mode);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    char replacement) {
+      this(new InputStreamReader(in),replacement);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    String charset,
+    char replacement) 
+      throws UnsupportedEncodingException {
+    this(new InputStreamReader(in,charset),replacement);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    Mode mode, 
+    char replacement) {
+      this(new InputStreamReader(in),mode, replacement);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    InputStream in, 
+    String charset, 
+    Mode mode,
+    char replacement) 
+      throws UnsupportedEncodingException {
+    this(new InputStreamReader(in,charset),mode,replacement);
+  }
+  
+  
+  protected XmlRestrictedCharFilter(
+    Reader in) {
+      this(in,Mode.XML10,(char)0);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    Reader in, 
+    Mode mode) {
+      this(in,mode,(char)0);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    Reader in, 
+    char replacement) {
+      this(in,Mode.XML10,replacement);
+  }
+  
+  protected XmlRestrictedCharFilter(
+    Reader in, 
+    Mode mode, 
+    char replacement) {
+      super(in);
+      this.set = mode == Mode.XML10 ? restrictedchar10 : restrictedchar11;
+      this.replacement = replacement;
+      if (replacement != 0 && 
+          ((!Character.isValidCodePoint(replacement)) || 
+          set.get(replacement))) 
+            throw new IllegalArgumentException();
+  }
+
+  @Override
+  public int read() throws IOException {
+    int c = -1;
+    if (replacement == 0) {
+      while(((c = super.read()) != -1 && set.get(c))) {}
+    } else {
+      c = super.read();
+      if (c != -1 && set.get(c)) c = replacement;
+    }
+    return c;
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    int n = off;
+    for (; n < Math.min(len,cbuf.length-off); n++) {
+      int r = read();
+      if (r != -1) cbuf[n] = (char)r;
+      else break;
+    }
+    return n - off;
+  }
+
+  private final ChainableBitSet restrictedchar10 =
+    new ChainableBitSet().set2(0, 8)
+                         .set2(11, 12)
+                         .set2(14, 31)
+                         .set2(55296, 57343)
+                         .set2(65534, 65535);
+
+  private final ChainableBitSet restrictedchar11 = 
+    new ChainableBitSet().set2(0, 8)
+                         .set2(11, 12)
+                         .set2(14, 31)
+                         .set2(127, 159)
+                         .set2(55296, 57343)
+                         .set2(65534, 65535);
+}

Modified: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java?view=diff&rev=559700&r1=559699&r2=559700
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java
(original)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java
Wed Jul 25 23:08:32 2007
@@ -30,6 +30,7 @@
 import org.apache.abdera.parser.Parser;
 import org.apache.abdera.parser.ParserOptions;
 import org.apache.abdera.parser.stax.util.FOMSniffingInputStream;
+import org.apache.abdera.parser.stax.util.FOMXmlRestrictedCharFilter;
 import org.apache.abdera.util.AbstractParser;
 import org.apache.abdera.util.Messages;
 import org.apache.abdera.i18n.iri.IRI;
@@ -99,10 +100,19 @@
         if (charset != null) options.setCharset(charset);
         in = sin;
       }
-      XMLStreamReader xmlreader = (charset == null) ? 
-        StAXUtils.createXMLStreamReader(in) : 
-        StAXUtils.createXMLStreamReader(in, charset); 
-      return parse(xmlreader, base, options);
+      if (options.getFilterRestrictedCharacters()) {
+        Reader rdr = (charset == null) ? 
+          new FOMXmlRestrictedCharFilter(
+            in,options.getFilterRestrictedCharacterReplacement()) :
+          new FOMXmlRestrictedCharFilter(
+            in,charset,options.getFilterRestrictedCharacterReplacement());
+        return parse(StAXUtils.createXMLStreamReader(rdr), base, options);
+      } else {
+        XMLStreamReader xmlreader = (charset == null) ? 
+          StAXUtils.createXMLStreamReader(in) : 
+          StAXUtils.createXMLStreamReader(in, charset); 
+        return parse(xmlreader, base, options);
+      }
     } catch (Exception e) {
       if (!(e instanceof ParseException))
         e = new ParseException(e);
@@ -119,6 +129,11 @@
       throw new IllegalArgumentException(Messages.get("READER.NOT.NULL"));
     try {
       if (options == null) options = getDefaultParserOptions();
+      if (options.getFilterRestrictedCharacters() && 
+          !(in instanceof FOMXmlRestrictedCharFilter)) {
+        in = new FOMXmlRestrictedCharFilter(
+          in,options.getFilterRestrictedCharacterReplacement());
+      }
       return parse(StAXUtils.createXMLStreamReader(in), base, options);
     } catch (Exception e) {
       if (!(e instanceof ParseException))

Added: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlRestrictedCharFilter.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlRestrictedCharFilter.java?view=auto&rev=559700
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlRestrictedCharFilter.java
(added)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlRestrictedCharFilter.java
Wed Jul 25 23:08:32 2007
@@ -0,0 +1,110 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.parser.stax.util;
+
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.abdera.util.XmlRestrictedCharFilter;
+
+public final class FOMXmlRestrictedCharFilter 
+  extends XmlRestrictedCharFilter {
+
+  public FOMXmlRestrictedCharFilter(
+    Reader in) {
+      this(new FOMXmlVersionReader(in));
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionReader in) {
+      super(in,getMode(in.getVersion()));
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    Reader in, 
+    char replacement) {
+      this(new FOMXmlVersionReader(in), replacement);
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionReader in, 
+    char replacement) {
+      super(in,getMode(in.getVersion()), replacement);
+  }
+
+  public FOMXmlRestrictedCharFilter(
+    InputStream in) {
+      this(new FOMXmlVersionInputStream(in));
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionInputStream in) {
+      super(in,getMode(in.getVersion()));
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    InputStream in, 
+    char replacement) {
+      this(new FOMXmlVersionInputStream(in), replacement);
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionInputStream in, 
+    char replacement) {
+      super(in,getMode(in.getVersion()), replacement);
+  }
+
+  public FOMXmlRestrictedCharFilter(
+    InputStream in, 
+    String charset) 
+      throws UnsupportedEncodingException {
+    this(new FOMXmlVersionInputStream(in),charset);
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionInputStream in, 
+    String charset) 
+      throws UnsupportedEncodingException {
+    super(in,charset,getMode(in.getVersion()));
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    InputStream in, 
+    String charset, 
+    char replacement) 
+      throws UnsupportedEncodingException {
+    this(new FOMXmlVersionInputStream(in), charset, replacement);
+  }
+  
+  public FOMXmlRestrictedCharFilter(
+    FOMXmlVersionInputStream in, 
+    String charset, 
+    char replacement) 
+      throws UnsupportedEncodingException {
+    super(in,charset, getMode(in.getVersion()), replacement);
+  }
+  
+  private static Mode getMode(String version) {
+    return version == null ? Mode.XML10 :
+           version.equals("1.0") ? Mode.XML10 :
+           version.equals("1.1") ? Mode.XML11 : 
+           Mode.XML10;
+  }
+  
+}

Added: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionInputStream.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionInputStream.java?view=auto&rev=559700
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionInputStream.java
(added)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionInputStream.java
Wed Jul 25 23:08:32 2007
@@ -0,0 +1,62 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.parser.stax.util;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamReader;
+
+/**
+ * Will attempt to autodetect the XML version from the start of the stream
+ * by peeking ahead; the version defaults to 1.0 if detection fails
+ */
+public class FOMXmlVersionInputStream 
+  extends FilterInputStream {
+
+  private String version = null;
+  
+  public FOMXmlVersionInputStream(InputStream in) {
+    super(new PeekAheadInputStream(in,4));
+    try {
+      version = detectVersion();
+    } catch (IOException e) {}
+  }
+
+  public String getVersion() {
+    return version;
+  }
+  
+  private String detectVersion() throws IOException {
+    String version = "1.0";
+    PeekAheadInputStream pin = (PeekAheadInputStream) this.in;
+    try { 
+      byte[] p = new byte[200];
+      pin.peek(p);
+      XMLStreamReader xmlreader = 
+        XMLInputFactory.newInstance().createXMLStreamReader(
+          new java.io.ByteArrayInputStream(p));
+      String v = xmlreader.getVersion();
+      if (v != null) version = v;
+    } catch (Exception e) {}
+    return version;
+  }
+  
+}

Added: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionReader.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionReader.java?view=auto&rev=559700
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionReader.java
(added)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/FOMXmlVersionReader.java
Wed Jul 25 23:08:32 2007
@@ -0,0 +1,58 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.parser.stax.util;
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamReader;
+
+public class FOMXmlVersionReader 
+  extends PushbackReader {
+
+  private String version = null;
+  
+  public FOMXmlVersionReader(Reader in) {
+    super(in,200);
+    try {
+      version = detectVersion();
+    } catch (IOException e) {}
+  }
+
+  public String getVersion() {
+    return version;
+  }
+  
+  private String detectVersion() throws IOException {
+    String version = "1.0";
+    try { 
+      char[] p = new char[200];
+      int r = read(p);
+      XMLStreamReader xmlreader = 
+        XMLInputFactory.newInstance().createXMLStreamReader(
+          new java.io.CharArrayReader(p));
+      String v = xmlreader.getVersion();
+      if (v != null) version = v;
+      unread(p,0,r);
+      
+    } catch (Exception e) {}
+    return version;
+  }
+}

Modified: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/PeekAheadInputStream.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/PeekAheadInputStream.java?view=diff&rev=559700&r1=559699&r2=559700
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/PeekAheadInputStream.java
(original)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/util/PeekAheadInputStream.java
Wed Jul 25 23:08:32 2007
@@ -81,6 +81,14 @@
 
   @Override
   public synchronized void unread(byte[] b, int off, int len) throws IOException {
+    // Only unread the bytes before the first zero (null) byte; otherwise
+    // we end up stuffing the buffer with a bunch of garbage
+    int c = off;
+    for (;c < Math.min(len, b.length - off);c++) {
+      if (b[c] == 0) break;
+    }
+    len = Math.min(len, c);
+    
     if (len > pos && pos + len > buf.length) {
       resize(len-pos);
       pos += len-pos;



Mime
View raw message