manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1492767 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Date Thu, 13 Jun 2013 17:11:23 GMT
Author: kwright
Date: Thu Jun 13 17:11:23 2013
New Revision: 1492767

URL: http://svn.apache.org/r1492767
Log:
Add more logging for indexing prohibition.  Part of CONNECTORS-715.

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1492767&r1=1492766&r2=1492767&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Jun 13 17:11:23 2013
@@ -5738,11 +5738,17 @@ public class WebcrawlerConnector extends
   {
     ProcessActivityRedirectionHandler redirectHandler = new ProcessActivityRedirectionHandler(documentIdentifier,activities,filter);
     handleRedirects(documentIdentifier,redirectHandler);
+    if (Logging.connectors.isDebugEnabled() && redirectHandler.shouldIndex() == false)
+      Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of redirection");
     // For html, we don't want any actions, because we don't do form submission.
     ProcessActivityHTMLHandler htmlHandler = new ProcessActivityHTMLHandler(documentIdentifier,activities,filter);
     handleHTML(documentIdentifier,htmlHandler);
+    if (Logging.connectors.isDebugEnabled() && htmlHandler.shouldIndex() == false)
+      Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of HTML robots or content tags prohibiting indexing");
     ProcessActivityXMLHandler xmlHandler = new ProcessActivityXMLHandler(documentIdentifier,activities,filter);
     handleXML(documentIdentifier,xmlHandler);
+    if (Logging.connectors.isDebugEnabled() && xmlHandler.shouldIndex() == false)
+      Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of XML robots or content tags prohibiting indexing");
     // May add more later for other extraction tasks.
     return htmlHandler.shouldIndex() && redirectHandler.shouldIndex() && xmlHandler.shouldIndex();
   }



Mime
View raw message