manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1443823 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: BOMEncodingDetector.java BOMParseState.java Parser.java XMLEncodingDetector.java XMLParseState.java
Date Fri, 08 Feb 2013 02:06:22 GMT
Author: kwright
Date: Fri Feb  8 02:06:21 2013
New Revision: 1443823

URL: http://svn.apache.org/r1443823
Log:
Write character encoding detection logic.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java
      - copied, changed from r1443435, manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java
  (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java
  (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java
  (with props)
Removed:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java

Copied: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java (from r1443435, manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java)
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java?p2=manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java&p1=manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java&r1=1443435&r2=1443823&rev=1443823&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java Fri Feb  8 02:06:21 2013
@@ -27,7 +27,7 @@ import java.util.*;
 * Once a preliminary character encoding is determined, an EncodingAccepter is notified,
 * and further bytes are sent to a provided ByteReceiver.
 */
-public class BOMParseState extends SingleByteReceiver implements EncodingDetector
+public class BOMEncodingDetector extends SingleByteReceiver implements EncodingDetector
 {
   protected String encoding = null;
   protected final ByteReceiver overflowByteReceiver;
@@ -37,7 +37,7 @@ public class BOMParseState extends Singl
   * If no receiver is passed in, the detector will stop as soon as the
   * BOM is either seen, or not seen.
   */
-  public BOMParseState(ByteReceiver overflowByteReceiver)
+  public BOMEncodingDetector(ByteReceiver overflowByteReceiver)
   {
     super(8);
     this.overflowByteReceiver = overflowByteReceiver;

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java?rev=1443823&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java Fri Feb  8 02:06:21 2013
@@ -0,0 +1,147 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+
+/** This is the main parser class.
+* This class has an entry point for both parsing XML and HTML.  The way the
+* parser works is to accept both an input stream (which the caller is responsible
+* for closing) as well as a CharacterReceiver that will do the actual parsing.
+* This class is responsible mainly for setup and character set detection, 
+*/
+public class Parser
+{
+  
+  /** Constructor.
+  * Someday there will be a constructor which accepts character detection
+  * configuration information, but for now there is none.
+  */
+  public Parser()
+  {
+  }
+  
+  /** Parse an input stream with character set detection.
+  * This method uses BOM (byte order mark) and the xml encoding tag to determine the character
encoding to use.
+  * The caller may pass in a starting character encoding, which functions as the default
if no better determination
+  * is made.
+  *@param startingCharset is the starting character set.  Pass null if this is unknown.
+  *@param inputStream is the input stream.  It is the caller's responsibility to close the
stream when the parse is done.
+  *@param characterReceiver is the character receiver that will actually do the parsing.
+  */
+  public void parseWithCharsetDetection(String startingCharset, InputStream inputStream,
CharacterReceiver characterReceiver)
+    throws IOException, ManifoldCFException
+  {
+    // Wrap the input stream, before we do anything else
+    ReplayableInputStream replayableInputStream = new ReplayableInputStream(inputStream);
+    
+    // First go-around: use the BOM detector with nothing downstream, since we don't know
the character set yet.
+    BOMEncodingDetector bomEncodingDetector = new BOMEncodingDetector(null);
+    bomEncodingDetector.setEncoding(startingCharset);
+    bomEncodingDetector.setInputStream(replayableInputStream);
+    try
+    {
+      while (true)
+      {
+        if (bomEncodingDetector.dealWithBytes())
+          break;
+      }
+    }
+    finally
+    {
+      bomEncodingDetector.finishUp();
+    }
+    
+    // Update our notion of what the character set is
+    startingCharset = bomEncodingDetector.getEncoding();
+    if (startingCharset == null)
+      startingCharset = "utf-8";
+    // Reset the stream
+    replayableInputStream.restart(false);
+    // Set up a detection chain that includes the XML detector.
+    // BOMEncodingDetector (for BOM detection) -> XMLEncodingDetector (for xml encoding
tag access)
+    XMLEncodingDetector xmlEncodingDetector = new XMLEncodingDetector();
+    xmlEncodingDetector.setEncoding(startingCharset);
+    bomEncodingDetector = new BOMEncodingDetector(new DecodingByteReceiver(1024,startingCharset,xmlEncodingDetector));
+    // Rerun the detection; this should finalize the value.
+    bomEncodingDetector.setInputStream(replayableInputStream);
+    try
+    {
+      while (true)
+      {
+        if (bomEncodingDetector.dealWithBytes())
+          break;
+      }
+    }
+    finally
+    {
+      bomEncodingDetector.finishUp();
+    }
+
+    // Get the final charset determination
+    startingCharset = xmlEncodingDetector.getEncoding();
+    // Reset for the final time
+    replayableInputStream.restart(true);
+    // Set up the whole chain and parse
+    bomEncodingDetector = new BOMEncodingDetector(new DecodingByteReceiver(65536,startingCharset,characterReceiver));
+    bomEncodingDetector.setInputStream(replayableInputStream);
+    try
+    {
+      while (true)
+      {
+        if (bomEncodingDetector.dealWithBytes())
+          break;
+      }
+    }
+    finally
+    {
+      bomEncodingDetector.finishUp();
+    }
+  }
+  
+  /** Parse an input stream without character set detection.
+  *@param startingCharset is the starting character set.  If null is passed, the code will
presume utf-8.
+  *@param inputStream is the input stream.  It is the caller's responsibility to close the
stream when the parse is done.
+  *@param characterReceiver is the character receiver that will actually do the parsing.
+  */
+  public void parseWithoutCharsetDetection(String startingCharset, InputStream inputStream,
CharacterReceiver characterReceiver)
+    throws IOException, ManifoldCFException
+  {
+    if (startingCharset == null)
+      startingCharset = "utf-8";
+    ByteReceiver byteReceiver = new DecodingByteReceiver(65536, startingCharset, characterReceiver);
+    // Set the input stream.
+    byteReceiver.setInputStream(inputStream);
+    try
+    {
+      // Process until done
+      while (true)
+      {
+        if (byteReceiver.dealWithBytes())
+          break;
+      }
+    }
+    finally
+    {
+      byteReceiver.finishUp();
+    }
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/Parser.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java?rev=1443823&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java Fri Feb  8 02:06:21 2013
@@ -0,0 +1,170 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+import java.io.*;
+
+/** This is the XML encoding detector.
+* It is basically looking for the preamble's <?xml ... ?> tag, which it parses
+* looking for the "encoding" attribute.  It stops either when it is beyond
+* any possibility of finding the preamble, or it finds the tag, whichever comes first.
+* To get that behavior, every callback in this class returns true ("halt"); only the
+* qtag callback additionally inspects what it was given.
+*/
+public class XMLEncodingDetector extends XMLParseState implements EncodingDetector
+{
+
+  /** Current encoding determination: the value handed to setEncoding(), replaced by the
+  * "encoding" attribute of an <?xml ... ?> tag if one is seen. */
+  protected String encoding = null;
+
+  /** Constructor.
+  */
+  public XMLEncodingDetector()
+  {
+  }
+
+  /** Set initial encoding.
+  *@param encoding is the encoding to report if no xml encoding attribute is found.
+  */
+  @Override
+  public void setEncoding(String encoding)
+  {
+    this.encoding = encoding;
+  }
+
+  /** Retrieve final encoding determination.
+  *@return the detected encoding, or the initial encoding if nothing better was found.
+  */
+  @Override
+  public String getEncoding()
+  {
+    return encoding;
+  }
+
+  /** Map version of the noteTag method.
+  * An ordinary tag means the preamble can no longer be found.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    // Terminate immediately.
+    return true;
+  }
+
+  /** This method gets called for every end tag.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** Map version of noteQTag method.
+  * If the qtag is named "xml" and carries an "encoding" attribute, that value becomes
+  * the encoding determination; otherwise the current determination is left alone.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteQTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (tagName.equals("xml"))
+    {
+      // Look for "encoding" attribute
+      String value = attributes.get("encoding");
+      if (value != null)
+        encoding = value;
+    }
+    // Either way, stop now.
+    return true;
+  }
+
+  /** This method is called for every <! <token> ... > construct, or 'btag'.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteBTag(String tagName)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** This method is called for the end of every btag, or any time
+  * there's a naked '>' in the document.
+  *@return true to halt further processing.
+  */
+  // NOTE(review): unlike its siblings this method carries no @Override; the parent
+  // declaration is not visible here, so confirm it exists before adding the annotation.
+  protected boolean noteEndBTag()
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>
+  *@param token may be empty!!!
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEscaped(String token)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** Called for the end of every cdata-like tag.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEndEscaped()
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** This method gets called for every token inside a btag.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteBTagToken(String token)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** This method gets called for every character that is not part of a tag etc.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteNormalCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+  /** This method gets called for every character that is found within an
+  * escape block, e.g. CDATA.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEscapedCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    return true;
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLEncodingDetector.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java?rev=1443823&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java Fri Feb  8 02:06:21 2013
@@ -0,0 +1,93 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+
+import java.util.*;
+import java.io.*;
+
+/** This class takes the output of the basic tag parser and converts it for
+* typical XML usage.  It takes the attribute lists, for instance, and converts
+* them to case-sensitive maps.  Subclasses override the Map-based methods; the
+* List-based ones are final and only perform the conversion.
+*/
+public class XMLParseState extends TagParseState
+{
+
+  /** Constructor.
+  */
+  public XMLParseState()
+  {
+  }
+
+  /** This method gets called for every tag.  It converts the attribute list to a map
+  * and hands off to the Map version, which is the one to override.
+  *@param tagName is the tag name, passed through unchanged.
+  *@param attributes is the attribute list as produced by the tag parser.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    return noteTag(tagName, toAttributeMap(attributes));
+  }
+
+  /** Map version of the noteTag method.
+  * This implementation only logs the tag at debug level.
+  *@return true to halt further processing; this implementation returns false.
+  */
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw tag '"+tagName+"'");
+    return false;
+  }
+
+  /** This method is called for every <? ... ?> construct, or 'qtag'.
+  * It converts the attribute list to a map and hands off to the Map version,
+  * which is the one to override.
+  *@param tagName is the qtag name, passed through unchanged.
+  *@param attributes is the attribute list as produced by the tag parser.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteQTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    return noteQTag(tagName, toAttributeMap(attributes));
+  }
+
+  /** Map version of noteQTag method.
+  * This implementation only logs the qtag at debug level.
+  *@return true to halt further processing; this implementation returns false.
+  */
+  protected boolean noteQTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw QTag '"+tagName+"'");
+    return false;
+  }
+
+  /** Convert an attribute list into a name-to-value map.
+  * Names are used as keys exactly as given (no case folding).  If the same name
+  * occurs more than once, the last occurrence wins.
+  *@param attributes is the attribute list to convert.
+  *@return a new map holding one entry per distinct attribute name.
+  */
+  private static Map<String,String> toAttributeMap(List<AttrNameValue> attributes)
+  {
+    Map<String,String> attrMap = new HashMap<String,String>(attributes.size());
+    for (AttrNameValue nv : attributes)
+    {
+      attrMap.put(nv.getName(), nv.getValue());
+    }
+    return attrMap;
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message