manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1444375 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: XMLFuzzyHierarchicalParseState.java XMLFuzzyParseState.java XMLParsingContext.java
Date Sat, 09 Feb 2013 14:57:23 GMT
Author: kwright
Date: Sat Feb  9 14:57:23 2013
New Revision: 1444375

URL: http://svn.apache.org/r1444375
Log:
Add base XML parsing classes equivalent to what we've been using with the SAX parser.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
  (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
  (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
  (with props)

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java?rev=1444375&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
(added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
Sat Feb  9 14:57:23 2013
@@ -0,0 +1,141 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+import java.util.*;
+
+/** Class to keep track of XML hierarchy in the face of possibly corrupt
+* XML and with case-insensitive tags, etc.
+* Basically, this class accepts what is supposedly XML but allows for various
+* kinds of handwritten corruption.  Specific kinds of errors allowed include:
+*
+* - Bad character encoding
+* - Tag case match problems; all attributes are (optionally) bashed to lower case,
+*    and tag names are checked to match when all lower case, if case-sensitive didn't
+*    work
+* - End tag matching problems, where someone lost an end tag somehow
+* - Other parsing recoveries to be added as they arise
+*
+* The functionality of this class is also somewhat lessened vs. standard
+* SAX-type parsers.  No namespace interpretation is done, for instance; tag qnames
+* are split into namespace name and local name, and that's all folks.  But if you need
+* more power, you can write a class extension that will do that readily.
+*
+* NOTE: in this revision the parse hooks below are placeholders (marked "MHL"); they
+* ignore their arguments and return false.  Only setContext(), getContext() and
+* cleanup() touch the context stack.
+*/
+public class XMLFuzzyHierarchicalParseState extends XMLFuzzyParseState
+{
+  /** The current (innermost) context, or null when no context has been established. */
+  protected XMLParsingContext currentContext = null;
+
+  /** Constructor.  All flags are handed unchanged to the superclass.
+  *@param lowerCaseAttributes true to lower-case attribute names of ordinary tags.
+  *@param lowerCaseTags true to lower-case ordinary tag names.
+  *@param lowerCaseQAttributes true to lower-case attribute names of qtags.
+  *@param lowerCaseQTags true to lower-case qtag names.
+  *@param lowerCaseBTags true to lower-case btag names and btag tokens.
+  *@param lowerCaseEscapeTags true to lower-case escape-block tokens.
+  */
+  public XMLFuzzyHierarchicalParseState(boolean lowerCaseAttributes, boolean lowerCaseTags,
+    boolean lowerCaseQAttributes, boolean lowerCaseQTags,
+    boolean lowerCaseBTags, boolean lowerCaseEscapeTags)
+  {
+    super(lowerCaseAttributes,lowerCaseTags,lowerCaseQAttributes,lowerCaseQTags,lowerCaseBTags,lowerCaseEscapeTags);
+  }
+
+  /** Set the current context.
+  *@param context is the context to make current; may be null to indicate "no context".
+  */
+  public void setContext(XMLParsingContext context)
+  {
+    currentContext = context;
+  }
+
+  /** Get the current context.
+  *@return the current context, or null if there is none.
+  */
+  public XMLParsingContext getContext()
+  {
+    return currentContext;
+  }
+
+  /** Call this method to clean up completely after a parse attempt, whether successful or failure.
+  * Safe to call when no context is set, and safe to call more than once.
+  */
+  public void cleanup()
+    throws ManifoldCFException
+  {
+    // XMLParsingContext.cleanup() replaces the current context with its predecessor via setContext().
+    // Keep going until the stack is empty, so that every context left on the stack is cleaned up.
+    // If an error occurs during cleanup, the exception propagates and currentContext is left non-null.
+    while (currentContext != null)
+    {
+      XMLParsingContext before = currentContext;
+      before.cleanup();
+      // Defensive: stop if the context did not pop itself, rather than looping forever.
+      if (currentContext == before)
+        break;
+    }
+  }
+
+  /** Map version of the noteTag method.
+  * Placeholder: the tag is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteTagEx(String tagName, String nameSpace, String localName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+  /** Note end tag.
+  * Placeholder: the end tag is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEndTagEx(String tagName, String nameSpace, String localName)
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+  /** This method gets called for every character that is not part of a tag etc.
+  * Override this method to intercept such characters.
+  * Placeholder: the character is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteNormalCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+  /** New version of the noteEscapedTag method.
+  * Placeholder: the token is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEscapedEx(String token)
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+  /** This method gets called for every character that is found within an
+  * escape block, e.g. CDATA.
+  * Override this method to intercept such characters.
+  * Placeholder: the character is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEscapedCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+  /** Called for the end of every cdata-like tag.
+  * Placeholder: the event is currently ignored.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected boolean noteEndEscaped()
+    throws ManifoldCFException
+  {
+    // MHL
+    return false;
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java?rev=1444375&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
(added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
Sat Feb  9 14:57:23 2013
@@ -0,0 +1,247 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+
+import java.util.*;
+import java.io.*;
+
+/** Class to keep track of XML hierarchy in the face of possibly corrupt
+* XML and with case-insensitive tags, etc.
+* Basically, this class accepts what is supposedly XML but allows for various
+* kinds of handwritten corruption.  Specific kinds of errors allowed include:
+*
+* - Bad character encoding
+* - Tag case match problems; all attributes are (optionally) bashed to lower case
+* - Other parsing recoveries to be added as they arise
+*
+* The functionality of this class is also somewhat lessened vs. standard
+* SAX-type parsers.  No namespace interpretation is done, for instance; tag qnames
+* are split into namespace name and local name, and that's all folks.  But if you need
+* more power, you can write a class extension that will do that readily.
+*/
+public class XMLFuzzyParseState extends TagParseState
+{
+  protected final boolean lowerCaseAttributes;
+  protected final boolean lowerCaseTags;
+  protected final boolean lowerCaseQAttributes;
+  protected final boolean lowerCaseQTags;
+  protected final boolean lowerCaseBTags;
+  protected final boolean lowerCaseEscapeTags;
+
+  /** Constructor.
+  */
+  public XMLFuzzyParseState(boolean lowerCaseAttributes, boolean lowerCaseTags,
+    boolean lowerCaseQAttributes, boolean lowerCaseQTags,
+    boolean lowerCaseBTags, boolean lowerCaseEscapeTags)
+  {
+    this.lowerCaseAttributes = lowerCaseAttributes;
+    this.lowerCaseTags = lowerCaseTags;
+    this.lowerCaseQAttributes = lowerCaseQAttributes;
+    this.lowerCaseQTags = lowerCaseQTags;
+    this.lowerCaseBTags = lowerCaseBTags;
+    this.lowerCaseEscapeTags = lowerCaseEscapeTags;
+  }
+  
+  /** This method gets called for every tag.  Override this method to intercept tag begins.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    Map<String,String> attrMap = new HashMap<String,String>(attributes.size());
+    for (AttrNameValue nv : attributes)
+    {
+      String name = nv.getName();
+      if (lowerCaseAttributes)
+        name = nv.getName().toLowerCase(Locale.ROOT);
+      attrMap.put(name, nv.getValue());
+    }
+    if (lowerCaseTags)
+      tagName = tagName.toLowerCase(Locale.ROOT);
+    int index = tagName.indexOf(":");
+    String nameSpace;
+    String localName;
+    if (index == -1)
+    {
+      localName = tagName;
+      nameSpace = null;
+    }
+    else
+    {
+      localName = tagName.substring(index+1);
+      nameSpace = tagName.substring(0,index);
+    }
+    return noteTagEx(tagName, nameSpace, localName, attrMap);
+  }
+
+  /** Map version of the noteTag method.
+  *@return true to halt further processing.
+  */
+  protected boolean noteTagEx(String tagName, String nameSpace, String localName, Map<String,String>
attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw tag '"+tagName+"'");
+    return false;
+  }
+
+  /** This method gets called for every end tag.  Override this method to intercept tag ends.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    if (lowerCaseTags)
+      tagName = tagName.toLowerCase(Locale.ROOT);
+    int index = tagName.indexOf(":");
+    String nameSpace;
+    String localName;
+    if (index == -1)
+    {
+      localName = tagName;
+      nameSpace = null;
+    }
+    else
+    {
+      localName = tagName.substring(index+1);
+      nameSpace = tagName.substring(0,index);
+    }
+
+    return noteEndTagEx(tagName, nameSpace, localName);
+  }
+
+  /** Note end tag.
+  */
+  protected boolean noteEndTagEx(String tagName, String nameSpace, String localName)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw end tag '"+tagName+"'");
+    return false;
+  }
+  
+  /** This method is called for every <? ... ?> construct, or 'qtag'.
+  * This is not useful for HTML.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteQTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    Map<String,String> attrMap = new HashMap<String,String>(attributes.size());
+    for (AttrNameValue nv : attributes)
+    {
+      String name = nv.getName();
+      if (lowerCaseQAttributes)
+        name = nv.getName().toLowerCase(Locale.ROOT);
+      attrMap.put(name, nv.getValue());
+    }
+    if (lowerCaseQTags)
+      tagName = tagName.toLowerCase(Locale.ROOT);
+    return noteQTagEx(tagName, attrMap);
+  }
+
+  /** Map version of the noteQTag method.
+  *@return true to halt further processing.
+  */
+  protected boolean noteQTagEx(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw QTag '"+tagName+"'");
+    return false;
+  }
+
+  /** This method is called for every <! <token> ... > construct, or 'btag'.
+  * Override it to intercept these.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteBTag(String tagName)
+    throws ManifoldCFException
+  {
+    if (lowerCaseBTags)
+      tagName = tagName.toLowerCase(Locale.ROOT);
+    return noteBTagEx(tagName);
+  }
+
+  /** New version of the noteBTag method.
+  *@return true to halt further processing.
+  */
+  protected boolean noteBTagEx(String tagName)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw BTag '"+tagName+"'");
+    return false;
+  }
+
+  /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>
+  *@param token may be empty!!!
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEscaped(String token)
+    throws ManifoldCFException
+  {
+    if (lowerCaseEscapeTags && token != null)
+      token = token.toLowerCase(Locale.ROOT);
+    return noteEscapedEx(token);
+  }
+
+  /** New version of the noteEscapedTag method.
+  *@return true to halt further processing.
+  */
+  protected boolean noteEscapedEx(String token)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw Escaped '"+token+"'");
+    return false;
+  }
+
+  /** This method gets called for every token inside a btag.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteBTagToken(String token)
+    throws ManifoldCFException
+  {
+    if (lowerCaseBTags)
+      token = token.toLowerCase(Locale.ROOT);
+    return noteBTagTokenEx(token);
+  }
+
+  /** New version of the noteBTagToken method.
+  *@return true to halt further processing.
+  */
+  protected boolean noteBTagTokenEx(String token)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw BTag token '"+token+"'");
+    return false;
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java?rev=1444375&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
(added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
Sat Feb  9 14:57:23 2013
@@ -0,0 +1,161 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+import java.util.*;
+
+/** An instance of this class represents a parsing context within a node.  Parsing functionality is implemented
+* by extending this class to do the right thing for the context in which it is deployed.  The base functionality
+* "does nothing"; extended functionality is needed to interpret nodes and act upon them.
+*
+* Contexts form a singly-linked stack: each context remembers the context that was current on the
+* owning stream at the moment it was constructed.
+*/
+public class XMLParsingContext
+{
+  /** The stream we belong to */
+  protected final XMLFuzzyHierarchicalParseState theStream;
+  /** The previous context, i.e. the stream's current context at construction time; may be null */
+  protected final XMLParsingContext previousContext;
+  /** The attributes belonging to the node associated with this context; may be null */
+  protected final Map<String,String> theseAttributes;
+  /** The namespace associated with the context */
+  protected final String namespace;
+  /** The localname associated with the context */
+  protected final String localname;
+  /** The qname associated with the context */
+  protected final String qname;
+
+  /** Root constructor.  Used for outer document level.
+  * The namespace, localname, qname and attributes are all null for such a context.
+  *@param theStream is the owning stream.
+  */
+  public XMLParsingContext(XMLFuzzyHierarchicalParseState theStream)
+  {
+    this(theStream,null,null,null,null);
+  }
+
+  /** Full constructor.  Used for individual tags.
+  * Note that this does not make the new context current on the stream; it only records
+  * the stream's present context as this context's predecessor.
+  *@param theStream is the owning stream.
+  *@param namespace is the namespace of the node.
+  *@param localname is the local name of the node.
+  *@param qname is the qname of the node.
+  *@param theseAttributes are the node's attributes, or null.
+  */
+  public XMLParsingContext(XMLFuzzyHierarchicalParseState theStream, String namespace, String localname, String qname, Map<String,String> theseAttributes)
+  {
+    this.theStream = theStream;
+    this.previousContext = theStream.getContext();
+    this.namespace = namespace;
+    this.localname = localname;
+    this.qname = qname;
+    this.theseAttributes = theseAttributes;
+  }
+
+  /** Get an attribute's value, if any.
+  *@param attributeName is the attribute name to look up.
+  *@return the value, or null if this context has no attributes or no such attribute.
+  */
+  public String getAttribute(String attributeName)
+  {
+    if (theseAttributes == null)
+      return null;
+    return theseAttributes.get(attributeName);
+  }
+
+  /** Get the namespace name of this node */
+  public String getNamespace()
+  {
+    return namespace;
+  }
+
+  /** Get the localname of this node */
+  public String getLocalname()
+  {
+    return localname;
+  }
+
+  /** Get the qname of this node */
+  public String getQname()
+  {
+    return qname;
+  }
+
+  /** Handle the start of a tag.
+  * Asks beginTag() for a context for the child tag, falls back to a default context if
+  * beginTag() returns null, and makes the result the stream's current context.
+  */
+  public final void startElement(String namespace, String localName, String qName, Map<String,String> atts)
+    throws ManifoldCFException
+  {
+    // For every child tag, we must create a new context.  We call a stub method to do that here; the stub method is meant
+    // to be overridden to provide the proper non-default context, where desired.
+    XMLParsingContext childContext = beginTag(namespace, localName, qName, atts);
+    if (childContext == null)
+      childContext = new XMLParsingContext(theStream,namespace,localName,qName,atts);
+    // We need to establish the new context in the stack of the owning stream object
+    theStream.setContext(childContext);
+  }
+
+  /** Handle the end of a tag.
+  * Notifies the previous context (if any) via endTag(), runs tagCleanup() on the stream's
+  * current context, and then restores the previous context as the stream's current context.
+  */
+  public final void endElement(String namespace, String localName, String qName)
+    throws ManifoldCFException
+  {
+    // Signal the end of the tag to the previous context first; the stream's current context
+    // has not been popped yet when endTag() runs.
+    if (previousContext != null)
+      previousContext.endTag();
+    // Before we leave the child context, clean up the child tag itself, but not the whole chain
+    theStream.getContext().tagCleanup();
+    // Go back to the parent context
+    theStream.setContext(previousContext);
+  }
+
+  /** Handle content of a tag */
+  public final void characters(String contents)
+    throws ManifoldCFException
+  {
+    // Call the overridden method with the right context object
+    tagContents(contents);
+  }
+
+  /** Cleanup this context object, and then continue up the chain.
+  * This method is called without fail at the end of any parse, whether it errored out or not, so that proper cleanup
+  * always happens for any tags left on the stack.  For each context from this one outward, tagCleanup() is called
+  * and the context is popped from its stream.  If a tagCleanup() throws, the walk stops there and the failing
+  * context is left as the stream's current context.
+  */
+  public final void cleanup()
+    throws ManifoldCFException
+  {
+    // Walk the chain iteratively rather than recursively, so nesting depth cannot exhaust the call stack.
+    XMLParsingContext context = this;
+    while (context != null)
+    {
+      context.tagCleanup();
+      XMLParsingContext parent = context.previousContext;
+      context.theStream.setContext(parent);
+      context = parent;
+    }
+  }
+
+  /** This method is meant to be extended by classes that extend this class.  The form of this method is meant to enable
+  * creation of a context object derived from XMLParsingContext that understands how to actually handle tags and content
+  * within the current context.
+  *@return the context to use for the child tag, or null to have a default XMLParsingContext created.
+  */
+  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+    throws ManifoldCFException
+  {
+    // The default action is to establish a new default context.
+    return null;
+  }
+
+  /** This method is meant to be extended by classes that extend this class.
+  * It is called on the previous context when a child tag ends.  The default does nothing.
+  */
+  protected void endTag()
+    throws ManifoldCFException
+  {
+  }
+
+  /** This method is meant to be extended by classes that extend this class.
+  * It receives the content passed to characters().  The default does nothing.
+  */
+  protected void tagContents(String contents)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Override this method to be called during cleanup.  The default does nothing. */
+  protected void tagCleanup()
+    throws ManifoldCFException
+  {
+  }
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLParsingContext.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message