incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1064643 - in /incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler: IHTMLHandler.java IMetaTagHandler.java LinkParseState.java MetaParseState.java WebcrawlerConnector.java
Date Fri, 28 Jan 2011 12:47:11 GMT
Author: kwright
Date: Fri Jan 28 12:47:11 2011
New Revision: 1064643

URL: http://svn.apache.org/viewvc?rev=1064643&view=rev
Log:
Add support for meta robots tags.  CONNECTORS-153.

Added:
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
  (with props)
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
  (with props)
Modified:
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1064643&r1=1064642&r2=1064643&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
Fri Jan 28 12:47:11 2011
@@ -23,7 +23,7 @@ import java.util.*;
 
 /** This interface describes the functionality needed by an HTML processor in order to handle an HTML document.
 */
-public interface IHTMLHandler extends IDiscoveredLinkHandler
+public interface IHTMLHandler extends IDiscoveredLinkHandler, IMetaTagHandler
 {
   /** Note the start of a form */
   public void noteFormStart(Map formAttributes)

Added: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java?rev=1064643&view=auto
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
(added)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
Fri Jan 28 12:47:11 2011
@@ -0,0 +1,33 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This interface describes the functionality needed by a parser to handle metadata tags.
+*/
+public interface IMetaTagHandler
+{
+  /** Inform the world of a discovered metadata tag.
+  *@param tagAttributes are the attributes that belong to the tag.
+  */
+  public void noteMetaTag(Map tagAttributes)
+    throws ManifoldCFException;
+}

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IMetaTagHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1064643&r1=1064642&r2=1064643&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
Fri Jan 28 12:47:11 2011
@@ -22,14 +22,14 @@ import org.apache.manifoldcf.core.interf
 import java.util.*;
 
 /** This class recognizes and interprets all links */
-public class LinkParseState extends ScriptParseState
+public class LinkParseState extends MetaParseState
 {
 
   protected IHTMLHandler handler;
 
   public LinkParseState(IHTMLHandler handler)
   {
-    super();
+    super(handler);
     this.handler = handler;
   }
 
@@ -37,7 +37,6 @@ public class LinkParseState extends Scri
     throws ManifoldCFException
   {
     super.noteNonscriptTag(tagName,attributes);
-    String lowerTagName = tagName.toLowerCase();
     if (tagName.equals("a"))
     {
       String hrefValue = (String)attributes.get("href");

Added: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java?rev=1064643&view=auto
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
(added)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
Fri Jan 28 12:47:11 2011
@@ -0,0 +1,45 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class recognizes and interprets all meta tags */
+public class MetaParseState extends ScriptParseState
+{
+  protected IMetaTagHandler handler;
+
+  public MetaParseState(IMetaTagHandler handler)
+  {
+    super();
+    this.handler = handler;
+  }
+
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteNonscriptTag(tagName,attributes);
+    if (tagName.equals("meta"))
+    {
+      handler.noteMetaTag(attributes);
+    }
+  }
+
+}

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1064643&r1=1064642&r2=1064643&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Fri Jan 28 12:47:11 2011
@@ -1127,12 +1127,18 @@ public class WebcrawlerConnector extends
         continue;
       }
 
+      // Now, extract links.
+      // We'll call the "link extractor" series, so we can plug more stuff in over time.
+      boolean indexDocument = extractLinks(documentIdentifier,activities,filter);
+
       // If scanOnly is set, we never ingest.  But all else is the same.
       if (!doScanOnly)
       {
         // Consider this document for ingestion.
         // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
         // about.
+        if (indexDocument)
+          indexDocument = isDataIngestable(activities,documentIdentifier);
 
         if (isDataIngestable(activities,documentIdentifier))
         {
@@ -1239,14 +1245,19 @@ public class WebcrawlerConnector extends
         }
         else
         {
+          // In case the indexability of the document changed, we still want to notify the incremental indexer.
+          // We do this by using a null url and a null repository document.  If a document with this identifier was
+          // previously indexed, it will be removed.
+          
+          // This is NOT quite the same as deleteDocument().  The deleteDocument() method removes the record, and
+          // thus the version string.  So, when that is used, we cannot tell if the document has changed; we simply have to try again.
+          activities.ingestDocument(documentIdentifier,version,null,null);
+          
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");
         }
       }
 
-      // Now, extract links.
-      // We'll call the "link extractor" series, so we can plug more stuff in over time.
-      extractLinks(documentIdentifier,activities,filter);
 
       i++;
     }
@@ -5242,6 +5253,12 @@ public class WebcrawlerConnector extends
       return discoveredFormData;
     }
 
+    /** Note a meta tag */
+    public void noteMetaTag(Map metaAttributes)
+      throws ManifoldCFException
+    {
+    }
+    
     /** Note the start of a form */
     public void noteFormStart(Map formAttributes)
       throws ManifoldCFException
@@ -5347,6 +5364,12 @@ public class WebcrawlerConnector extends
       this.preferredLinkPattern = preferredLinkPattern;
     }
 
+    /** Note a meta tag */
+    public void noteMetaTag(Map metaAttributes)
+      throws ManifoldCFException
+    {
+    }
+
     /** Note the start of a form */
     public void noteFormStart(Map formAttributes)
       throws ManifoldCFException
@@ -5509,14 +5532,18 @@ public class WebcrawlerConnector extends
 
 
   /** Code to extract links from an already-fetched document. */
-  protected void extractLinks(String documentIdentifier, IProcessActivity activities, DocumentURLFilter filter)
+  protected boolean extractLinks(String documentIdentifier, IProcessActivity activities, DocumentURLFilter filter)
     throws ManifoldCFException, ServiceInterruption
   {
-    handleRedirects(documentIdentifier,new ProcessActivityRedirectionHandler(documentIdentifier,activities,filter));
+    ProcessActivityRedirectionHandler redirectHandler = new ProcessActivityRedirectionHandler(documentIdentifier,activities,filter);
+    handleRedirects(documentIdentifier,redirectHandler);
     // For html, we don't want any actions, because we don't do form submission.
-    handleHTML(documentIdentifier,new ProcessActivityHTMLHandler(documentIdentifier,activities,filter));
-    handleXML(documentIdentifier,new ProcessActivityXMLHandler(documentIdentifier,activities,filter));
+    ProcessActivityHTMLHandler htmlHandler = new ProcessActivityHTMLHandler(documentIdentifier,activities,filter);
+    handleHTML(documentIdentifier,htmlHandler);
+    ProcessActivityXMLHandler xmlHandler = new ProcessActivityXMLHandler(documentIdentifier,activities,filter);
+    handleXML(documentIdentifier,xmlHandler);
     // May add more later for other extraction tasks.
+    return htmlHandler.shouldIndex() && redirectHandler.shouldIndex() && xmlHandler.shouldIndex();
   }
 
   /** This class is the handler for links that get added into a IProcessActivity object.
@@ -5569,17 +5596,74 @@ public class WebcrawlerConnector extends
     {
       super(documentIdentifier,activities,filter,"redirection",REL_REDIRECT);
     }
+    
+    public boolean shouldIndex()
+    {
+      return true;
+    }
+
   }
 
   /** Class that describes HTML handling */
   protected class ProcessActivityHTMLHandler extends ProcessActivityLinkHandler implements IHTMLHandler
   {
+    boolean allowIndex = true;
+    boolean allowFollow = true;
+    
     /** Constructor. */
     public ProcessActivityHTMLHandler(String documentIdentifier, IProcessActivity activities, DocumentURLFilter filter)
     {
       super(documentIdentifier,activities,filter,"html",REL_LINK);
     }
 
+    /** Decide whether we should index. */
+    public boolean shouldIndex()
+    {
+      return allowIndex;
+    }
+    
+    /** Note a meta tag */
+    public void noteMetaTag(Map metaAttributes)
+      throws ManifoldCFException
+    {
+      String name = (String)metaAttributes.get("name");
+      if (name != null && name.toLowerCase().equals("robots"))
+      {
+        String contentValue = (String)metaAttributes.get("content");
+        if (contentValue != null)
+        {
+          contentValue = contentValue.toLowerCase();
+          // Parse content value
+          try
+          {
+            String[] contentValues = contentValue.split("[, ]");
+            int i = 0;
+            while (i < contentValues.length)
+            {
+              String cv = contentValues[i++];
+              if (cv.equals("index"))
+                allowIndex = true;
+              else if (cv.equals("noindex"))
+                allowIndex = false;
+              else if (cv.equals("none"))
+              {
+                allowFollow = false;
+                allowIndex = false;
+              }
+              else if (cv.equals("follow"))
+                allowFollow = true;
+              else if (cv.equals("nofollow"))
+                allowFollow = false;
+            }
+          }
+          catch (PatternSyntaxException e)
+          {
+            throw new ManifoldCFException(e.getMessage(),e);
+          }
+        }
+      }
+    }
+
     /** Note the start of a form */
     public void noteFormStart(Map formAttributes)
       throws ManifoldCFException
@@ -5602,28 +5686,32 @@ public class WebcrawlerConnector extends
     public void noteAHREF(String rawURL)
       throws ManifoldCFException
     {
-      noteDiscoveredLink(rawURL);
+      if (allowFollow)
+        noteDiscoveredLink(rawURL);
     }
 
     /** Note discovered href */
     public void noteLINKHREF(String rawURL)
       throws ManifoldCFException
     {
-      noteDiscoveredLink(rawURL);
+      if (allowFollow)
+        noteDiscoveredLink(rawURL);
     }
 
     /** Note discovered IMG SRC */
     public void noteIMGSRC(String rawURL)
       throws ManifoldCFException
     {
-      noteDiscoveredLink(rawURL);
+      if (allowFollow)
+        noteDiscoveredLink(rawURL);
     }
 
     /** Note discovered FRAME SRC */
     public void noteFRAMESRC(String rawURL)
       throws ManifoldCFException
     {
-      noteDiscoveredLink(rawURL);
+      if (allowFollow)
+        noteDiscoveredLink(rawURL);
     }
 
   }
@@ -5637,6 +5725,11 @@ public class WebcrawlerConnector extends
       super(documentIdentifier,activities,filter,"xml",REL_LINK);
     }
 
+    public boolean shouldIndex()
+    {
+      return true;
+    }
+    
     /** Inform the world of a discovered ttl value.
     *@param rawTtlValue is the raw discovered ttl value.  Null indicates we should set the default.
     */



Mime
View raw message