manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1444218 - /manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Date Fri, 08 Feb 2013 20:29:22 GMT
Author: kwright
Date: Fri Feb  8 20:29:21 2013
New Revision: 1444218

URL: http://svn.apache.org/r1444218
Log:
Hook up moved HTML parser.

Removed:
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
Modified:
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
    manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1444218&r1=1444217&r2=1444218&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java	(original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java	Fri Feb  8 20:29:21 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class interprets the tag stream generated by the BasicParseState class, and keeps track of the form tags. */
@@ -46,10 +47,11 @@ public class FormParseState extends Link
   // Override methods having to do with notification of tag discovery
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     switch (formParseState)
     {
     case FORMPARSESTATE_NORMAL:
@@ -125,13 +127,15 @@ public class FormParseState extends Link
     default:
       throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteNonscriptEndTag(String tagName)
+  protected boolean noteNonscriptEndTag(String tagName)
     throws ManifoldCFException
   {
-    super.noteNonscriptEndTag(tagName);
+    if (super.noteNonscriptEndTag(tagName))
+      return true;
     switch (formParseState)
     {
     case FORMPARSESTATE_NORMAL:
@@ -158,7 +162,7 @@ public class FormParseState extends Link
           optionMap.put("type","select");
           optionMap.put("name",selectName);
           optionMap.put("multiple",selectMultiple);
-          optionMap.put("value",htmlBodyDecode(optionValueText.toString()));
+          optionMap.put("value",bodyDecode(optionValueText.toString()));
           optionMap.put("selected",optionSelected);
           handler.noteFormInput(optionMap);
         }
@@ -175,13 +179,15 @@ public class FormParseState extends Link
     default:
       throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteNormalCharacter(char thisChar)
+  protected boolean noteNormalCharacter(char thisChar)
     throws ManifoldCFException
   {
-    super.noteNormalCharacter(thisChar);
+    if (super.noteNormalCharacter(thisChar))
+      return true;
     if (formParseState == FORMPARSESTATE_IN_OPTION)
     {
       if (optionValueText != null)
@@ -189,6 +195,7 @@ public class FormParseState extends Link
     }
     else
       handler.noteTextCharacter(thisChar);
+    return false;
   }
 
 }

Modified: manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1444218&r1=1444217&r2=1444218&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java	(original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java	Fri Feb  8 20:29:21 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class recognizes and interprets all links */
@@ -34,10 +35,11 @@ public class LinkParseState extends Meta
   }
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     if (tagName.equals("a"))
     {
       String hrefValue = (String)attributes.get("href");
@@ -62,6 +64,7 @@ public class LinkParseState extends Meta
       if (srcValue != null && srcValue.length() > 0)
         handler.noteFRAMESRC(srcValue);
     }
+    return false;
   }
 
 }

Modified: manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java?rev=1444218&r1=1444217&r2=1444218&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java	(original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java	Fri Feb  8 20:29:21 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class recognizes and interprets all meta tags */
@@ -33,14 +34,16 @@ public class MetaParseState extends Scri
   }
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     if (tagName.equals("meta"))
     {
       handler.noteMetaTag(attributes);
     }
+    return false;
   }
 
 }

Modified: manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1444218&r1=1444217&r2=1444218&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java	(original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java	Fri Feb  8 20:29:21 2013
@@ -19,10 +19,11 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
-/** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped */
-public class ScriptParseState extends BasicParseState
+/** This class interprets the tag stream generated by the HTMLParseState class, and causes script sections to be skipped */
+public class ScriptParseState extends HTMLParseState
 {
   // Script tag parsing states
   protected static final int SCRIPTPARSESTATE_NORMAL = 0;
@@ -38,17 +39,19 @@ public class ScriptParseState extends Ba
   // Override methods having to do with notification of tag discovery
 
   @Override
-  protected void noteTag(String tagName, Map attributes)
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteTag(tagName,attributes);
+    if (super.noteTag(tagName,attributes))
+      return true;
     switch (scriptParseState)
     {
     case SCRIPTPARSESTATE_NORMAL:
       if (tagName.equals("script"))
         scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
       else
-        noteNonscriptTag(tagName,attributes);
+        if (noteNonscriptTag(tagName,attributes))
+          return true;
       break;
     case SCRIPTPARSESTATE_INSCRIPT:
       // Skip all tags until we see the end script one.
@@ -56,17 +59,20 @@ public class ScriptParseState extends Ba
     default:
       throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteEndTag(String tagName)
+  protected boolean noteTagEnd(String tagName)
     throws ManifoldCFException
   {
-    super.noteEndTag(tagName);
+    if (super.noteTagEnd(tagName))
+      return true;
     switch (scriptParseState)
     {
     case SCRIPTPARSESTATE_NORMAL:
-      noteNonscriptEndTag(tagName);
+      if (noteNonscriptEndTag(tagName))
+        return true;
       break;
     case SCRIPTPARSESTATE_INSCRIPT:
       // Skip all tags until we see the end script one.
@@ -76,16 +82,19 @@ public class ScriptParseState extends Ba
     default:
       break;
     }
+    return false;
   }
 
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
+    return false;
   }
 
-  protected void noteNonscriptEndTag(String tagName)
+  protected boolean noteNonscriptEndTag(String tagName)
     throws ManifoldCFException
   {
+    return false;
   }
 
 }

Modified: manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1444218&r1=1444217&r2=1444218&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java	(original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java	Fri Feb  8 20:29:21 2013
@@ -24,6 +24,8 @@ import org.apache.manifoldcf.crawler.int
 import org.apache.manifoldcf.crawler.system.Logging;
 import org.apache.manifoldcf.crawler.system.ManifoldCF;
 
+import org.apache.manifoldcf.core.fuzzyml.*;
+
 import org.xml.sax.Attributes;
 
 import org.apache.manifoldcf.core.common.XMLDoc;
@@ -6768,33 +6770,14 @@ public class WebcrawlerConnector extends
         return;
       }
 
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
-
       try
       {
-        // Create a reader for the described encoding, if that's possible
-        Reader r = new InputStreamReader(is,encoding);
-        try
-        {
-          // We read characters at a time, understanding the basic form of html.
-          // This code represents a basic bottom-up parser, which is the best thing since we really don't want to code up all the context we'd need
-          // to do a top-down parse.  So, there is a parse state, and the code walks through the document recognizing symbols and modifying the state.
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
 
-          FormParseState currentParseState = new FormParseState(handler);
-          while (true)
-          {
-            int x = r.read();
-            if (x == -1)
-              break;
-            currentParseState.dealWithCharacter((char)x);
-          }
-          currentParseState.finishUp();
-        }
-        finally
-        {
-          r.close();
-        }
+        // Instantiate the parser, and call the right method
+        Parser p = new Parser();
+        p.parseWithoutCharsetDetection(encoding,is,new FormParseState(handler));
       }
       catch (UnsupportedEncodingException e)
       {



Mime
View raw message