manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1681736 - in /manifoldcf/branches/dev_1x: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/...
Date Tue, 26 May 2015 10:41:39 GMT
Author: kwright
Date: Tue May 26 10:41:38 2015
New Revision: 1681736

URL: http://svn.apache.org/r1681736
Log:
Pull up fix for CONNECTORS-1193 from trunk.

Added:
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
      - copied unchanged from r1681735, manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/CHANGES.txt
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
    manifoldcf/branches/dev_1x/framework/   (props changed)
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
    manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 26 10:41:38 2015
@@ -123,4 +123,4 @@
 /manifoldcf/branches/CONNECTORS-981:1605049-1605773
 /manifoldcf/branches/CONNECTORS-989:1611600-1612101
 /manifoldcf/branches/CONNECTORS-990:1610284-1610707
-/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390
+/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390,1681735

Modified: manifoldcf/branches/dev_1x/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/CHANGES.txt?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/CHANGES.txt (original)
+++ manifoldcf/branches/dev_1x/CHANGES.txt Tue May 26 10:41:38 2015
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 1.10-dev =====================
 
+CONNECTORS-1193: Add ability to discard web content based on a 
+set of regular expressions.
+(Arcadius Ahouansou)
+
 CONNECTORS-1199: SearchBlox connector formatting non-standard.
 (Karl Wright)
 

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Tue May 26 10:41:38 2015
@@ -18,15 +18,17 @@
 */
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
-import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
-import java.util.regex.*;
-import java.util.*;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import static java.util.Arrays.asList;
+import static org.apache.manifoldcf.crawler.system.Logging.connectors;
 
 /** This class is the handler for HTML content grepping during state transitions */
 public class FindContentHandler extends FindHandler implements IHTMLHandler
 {
-  protected final Pattern contentPattern;
+  protected final List<Pattern> contentPatterns;
   protected final StringBuilder contentBuffer = new StringBuilder();
 
   protected final static int MAX_LENGTH = 65536;
@@ -35,7 +37,13 @@ public class FindContentHandler extends
   public FindContentHandler(String parentURI, Pattern contentPattern)
   {
     super(parentURI);
-    this.contentPattern = contentPattern;
+    this.contentPatterns = asList(contentPattern);
+  }
+
+  public FindContentHandler(String parentURI, List<Pattern> contentPatterns)
+  {
+    super(parentURI);
+    this.contentPatterns = contentPatterns;
   }
 
   /** Apply overrides */
@@ -69,10 +77,14 @@ public class FindContentHandler extends
         // continuity
         String bufferContents = contentBuffer.toString();
         contentBuffer.setLength(0);
-        if (contentPattern.matcher(bufferContents).find())
-          targetURI = "";
-        else
-        {
+        for (Pattern contentPattern : contentPatterns) {
+          if (contentPattern.matcher(bufferContents).find()) {
+            targetURI = "";
+            break;
+          }
+        }
+
+        if(targetURI == null) {
           contentBuffer.append(bufferContents.substring(bufferContents.length() - OVERLAP_AMOUNT));
         }
       }
@@ -153,8 +165,12 @@ public class FindContentHandler extends
   {
     String bufferContents = contentBuffer.toString();
     contentBuffer.setLength(0);
-    if (contentPattern.matcher(bufferContents).find())
-      targetURI = "";
+    for(Pattern contentPattern: contentPatterns) {
+      if (contentPattern.matcher(bufferContents).find()) {
+        targetURI = "";
+        return;
+      }
+    }
   }
 
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java Tue May 26 10:41:38 2015
@@ -137,6 +137,12 @@ public class WebcrawlerConfig
   * if any one matches, causes the URL to be excluded from indexing.  These
   * regexps are newline separated, and # starts a comment.  */
   public static final String NODE_EXCLUDESINDEX = "excludesindex";
+
+  /**
+   * Exclude any page containing specified regex in their body from index
+   */
+  public static final String NODE_EXCLUDESCONTENTINDEX = "excludescontentindex";
+
   /** Limit to seeds.  When value attribute is true, only seed domains will be permitted. */
   public static final String NODE_LIMITTOSEEDS = "limittoseeds";
   /** Canonicalization rule.  Attributes are regexp, description, reorder, 

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue May 26 10:41:38 2015
@@ -28,10 +28,6 @@ import org.apache.manifoldcf.ui.util.Enc
 import org.apache.manifoldcf.core.fuzzyml.*;
 
 import org.apache.http.conn.ConnectTimeoutException;
-import org.apache.http.client.RedirectException;
-import org.apache.http.client.CircularRedirectException;
-import org.apache.http.NoHttpResponseException;
-import org.apache.http.HttpException;
 
 import java.io.*;
 import java.nio.charset.StandardCharsets;
@@ -507,7 +503,7 @@ public class WebcrawlerConnector extends
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
   *@param seedTime is the end of the time range of documents to consider, exclusive.
-  *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   *@return an updated seeding version string, to be stored with the job.
   */
@@ -1335,7 +1331,7 @@ public class WebcrawlerConnector extends
         activities.noDocument(documentIdentifier,versionString);
         return;
       }
-      
+
       if (activities.checkURLIndexable(documentIdentifier) == false)
       {
         if (Logging.connectors.isDebugEnabled())
@@ -1387,7 +1383,15 @@ public class WebcrawlerConnector extends
         activities.noDocument(documentIdentifier,versionString);
         return;
       }
-      
+
+      if(!filter.isDocumentContentIndexable(documentIdentifier)){
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document content matched document content exclusion rule");
+        errorCode = activities.EXCLUDED_CONTENT;
+        errorDesc = "Rejected due to content exclusion rule";
+        activities.noDocument(documentIdentifier,versionString);
+        return;
+      }
       // Ingest the document
       if (Logging.connectors.isDebugEnabled())
         Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
@@ -3895,6 +3899,8 @@ public class WebcrawlerConnector extends
     String exclusions = "";
     String inclusionsIndex = ".*\n";
     String exclusionsIndex = "";
+    String exclusionsContentIndex = "";
+    
     boolean includeMatching = true;
     Set<String> excludedHeaders = new HashSet<String>();
     
@@ -3945,6 +3951,12 @@ public class WebcrawlerConnector extends
         if (exclusionsIndex == null)
           exclusionsIndex = "";
       }
+      else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+      {
+        exclusionsContentIndex = sn.getValue();
+        if (exclusionsContentIndex == null)
+        	exclusionsContentIndex = "";
+      }
       else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
       {
         String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -4302,14 +4314,21 @@ public class WebcrawlerConnector extends
 "      <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionsindex\">"+Encoder.bodyEscape(exclusionsIndex)+"</textarea>\n"+
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+"    <td class=\"value\" colspan=\"1\">\n"+
+"      <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionscontentindex\">"+Encoder.bodyEscape(exclusionsContentIndex)+"</textarea>\n"+
+"    </td>\n"+
+"  </tr>\n"+
 "</table>\n"
       );
     }
     else
     {
       out.print(
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusions\" value=\""+Encoder.attributeEscape(exclusions)+"\"/>\n"+
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusionsindex\" value=\""+Encoder.attributeEscape(exclusionsIndex)+"\"/>\n"
+              "<input type=\"hidden\" name=\"" + seqPrefix + "exclusions\" value=\"" + Encoder.attributeEscape(exclusions) + "\"/>\n" +
+                      "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionsindex\" value=\"" + Encoder.attributeEscape(exclusionsIndex) + "\"/>\n" +
+                      "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionscontentindex\" value=\"" + Encoder.attributeEscape(exclusionsContentIndex) + "\"/>\n"
       );
     }
   
@@ -4756,6 +4775,26 @@ public class WebcrawlerConnector extends
       ds.addChild(ds.getChildCount(),cn);
     }
 
+    // Get the content index exclusions
+    String exclusionsContentIndex = variableContext.getParameter(seqPrefix+"exclusionscontentindex");
+    if (exclusionsContentIndex != null)
+    {
+      // Delete existing content exclusions record first
+      int i = 0;
+      while (i < ds.getChildCount())
+      {
+        SpecificationNode sn = ds.getChild(i);
+        if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+          ds.removeChild(i);
+        else
+          i++;
+      }
+
+      SpecificationNode cn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+      cn.setValue(exclusionsContentIndex);
+      ds.addChild(ds.getChildCount(),cn);
+    }
+
     // Read the url specs
     String urlRegexpCount = variableContext.getParameter(seqPrefix+"urlregexpcount");
     if (urlRegexpCount != null && urlRegexpCount.length() > 0)
@@ -4962,6 +5001,8 @@ public class WebcrawlerConnector extends
     String exclusions = "";
     String inclusionsIndex = ".*\n";
     String exclusionsIndex = "";
+    String exclusionsContentIndex = "";
+
     boolean includeMatching = false;
     Set<String> excludedHeaders = new HashSet<String>();
     
@@ -4999,6 +5040,12 @@ public class WebcrawlerConnector extends
         if (exclusionsIndex == null)
           exclusionsIndex = "";
       }
+      else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+      {
+        exclusionsContentIndex = sn.getValue();
+        if (exclusionsContentIndex == null)
+        	exclusionsContentIndex = "";
+      }
       else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
       {
         String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -5377,6 +5424,48 @@ public class WebcrawlerConnector extends
     out.print(
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"
+    );
+    try
+    {
+      java.io.Reader str = new java.io.StringReader(exclusionsContentIndex);
+      try
+      {
+        java.io.BufferedReader is = new java.io.BufferedReader(str);
+        try
+        {
+          while (true)
+          {
+            String nextString = is.readLine();
+            if (nextString == null)
+              break;
+            if (nextString.length() == 0)
+              continue;
+            out.print(
+"      <nobr>"+Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+            );
+          }
+        }
+        finally
+        {
+          is.close();
+        }
+      }
+      finally
+      {
+        str.close();
+      }
+    }
+    catch (java.io.IOException e)
+    {
+      throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+    }
+    out.print(
+"    </td>\n"+
+"  </tr>\n"+
 "    \n"+
 "  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
     );
@@ -6197,7 +6286,7 @@ public class WebcrawlerConnector extends
     handler.applyOverrides(lp);
     return handler.getTargetURI();
   }
-  
+
   /** Find HTML link URI, if present, making sure specified preference is matched. */
   protected String findHTMLLinkURI(String currentURI, LoginParameters lp)
     throws ManifoldCFException
@@ -8013,7 +8102,7 @@ public class WebcrawlerConnector extends
 
   /** This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification.
   */
-  protected static class DocumentURLFilter
+  protected class DocumentURLFilter
   {
     /** The version string */
     protected String versionString;
@@ -8029,7 +8118,10 @@ public class WebcrawlerConnector extends
     protected final List<Pattern> excludeIndexPatterns = new ArrayList<Pattern>();
     /** The hash map of seed hosts, to limit urls by, if non-null */
     protected Set<String> seedHosts = null;
-    
+
+    /**List of content exclusion pattern*/
+    protected final List<Pattern> excludeContentIndexPatterns = new ArrayList<Pattern>();
+
     /** Canonicalization policies */
     protected final CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies();
 
@@ -8045,6 +8137,7 @@ public class WebcrawlerConnector extends
       String excludes = "";
       String includesIndex = ".*";
       String excludesIndex = "";
+      String excludesContentIndex = "";
       String seeds = "";
       List<String> packList = new ArrayList<String>();
       String[] packStuff = new String[2];
@@ -8176,12 +8269,19 @@ public class WebcrawlerConnector extends
             throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e);
           }
         }
+        else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+        {
+          excludesContentIndex = sn.getValue();
+          if (excludesContentIndex == null)
+            excludesContentIndex = "";
+        }
       }
 
       // Note: format change since MCF 1.7 release
       StringBuilder versionBuffer = new StringBuilder();
       pack(versionBuffer,includesIndex,'+');
       pack(versionBuffer,excludesIndex,'+');
+      pack(versionBuffer,excludesContentIndex,'+');
       packList(versionBuffer,packList,'+');
       versionString = versionBuffer.toString();
       
@@ -8194,7 +8294,9 @@ public class WebcrawlerConnector extends
       compileList(includeIndexPatterns,list);
       list = stringToArray(excludesIndex);
       compileList(excludeIndexPatterns,list);
-      
+      list = stringToArray(excludesContentIndex);
+      compileList(excludeContentIndexPatterns,list);
+
       if (limitToSeeds)
       {
         seedHosts = new HashSet<String>();
@@ -8365,6 +8467,30 @@ public class WebcrawlerConnector extends
       return canonicalizationPolicies;
     }
 
+    public boolean isDocumentContentIndexable(String documentIdentifier) throws ManifoldCFException {
+        String content = findSpecifiedContent(documentIdentifier, excludeContentIndexPatterns);
+        if (content != null) {
+          if (Logging.connectors.isDebugEnabled())
+            Logging.connectors.debug("WEB: Url '" + documentIdentifier + "' is not indexable because content exclusion pattern was matched");
+
+          return false;
+      }
+      return true;
+    }
+
+    protected String findSpecifiedContent(String currentURI, List<Pattern> patterns) throws ManifoldCFException
+    {
+      if (excludeContentIndexPatterns.isEmpty()) {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: no content exclusion rule supplied... returning");
+        return null;
+      }
+
+      FindContentHandler handler = new FindContentHandler(currentURI, patterns);
+      handleHTML(currentURI, handler);
+      return handler.getTargetURI();
+    }
+
   }
 
   protected static class FetchStatus

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties Tue May 26 10:41:38 2015
@@ -76,6 +76,7 @@ WebcrawlerConnector.IncludeInIndex=Inclu
 WebcrawlerConnector.IncludeOnlyHostsMatchingSeeds=Include only hosts matching seeds?
 WebcrawlerConnector.ExcludeFromCrawl=Exclude from crawl:
 WebcrawlerConnector.ExcludeFromIndex=Exclude from index:
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.DeleteToken=Delete token #
 WebcrawlerConnector.NoAccessTokensPresent=No access tokens present
 WebcrawlerConnector.AddAccessToken=Add access token

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties Tue May 26 10:41:38 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
 WebcrawlerConnector.ExcludeFromCrawl=クロールから除外:
 WebcrawlerConnector.ExcludeFromIndex=索引が除外:
 WebcrawlerConnector.DeleteToken=トークンを削除 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.NoAccessTokensPresent=アクセストークンがありません
 WebcrawlerConnector.AddAccessToken=アクセストークンを追加
 WebcrawlerConnector.DeleteMetadata=メタデータを削除 #

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties Tue May 26 10:41:38 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
 WebcrawlerConnector.ExcludeFromCrawl=排除于爬虫外: 
 WebcrawlerConnector.ExcludeFromIndex=排除于索引外: 
 WebcrawlerConnector.DeleteToken=删除令牌 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.NoAccessTokensPresent=访问令牌不存在
 WebcrawlerConnector.AddAccessToken=添加访问令牌
 WebcrawlerConnector.DeleteMetadata=删除元数据 #

Propchange: manifoldcf/branches/dev_1x/framework/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 26 10:41:38 2015
@@ -113,4 +113,4 @@
 /manifoldcf/branches/CONNECTORS-989/framework:1611600-1612101
 /manifoldcf/branches/CONNECTORS-990/framework:1610284-1610707
 /manifoldcf/trunk:1629122
-/manifoldcf/trunk/framework:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1632013,1632289,1632844,1633108,1633193,1633202,1633348,1633364,1634145,1634148,1634155,1634264,1634373,1634530,1635438,1635809,1636146,1636180,1636207,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1639593,1639600,1640018,1640101,1640199,1640314,1640319,1640749,1640772,1640925,1640941,1641222,1641557,1641559,1
 641724,1641911,1642163,1642255,1642318,1644197,1644399,1646317,1646397,1646403,1646640,1647574,1647585,1647608,1649605,1650351,1650911,1651332,1651539,1651921,1655377,1655411,1657346,1658004,1658036,1660258,1660276,1669487,1670614,1672616,1672737,1676094,1681390
+/manifoldcf/trunk/framework:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1632013,1632289,1632844,1633108,1633193,1633202,1633348,1633364,1634145,1634148,1634155,1634264,1634373,1634530,1635438,1635809,1636146,1636180,1636207,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1639593,1639600,1640018,1640101,1640199,1640314,1640319,1640749,1640772,1640925,1640941,1641222,1641557,1641559,1
 641724,1641911,1642163,1642255,1642318,1644197,1644399,1646317,1646397,1646403,1646640,1647574,1647585,1647608,1649605,1650351,1650911,1651332,1651539,1651921,1655377,1655411,1657346,1658004,1658036,1660258,1660276,1669487,1670614,1672616,1672737,1676094,1681390,1681735

Modified: manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java Tue May 26 10:41:38 2015
@@ -35,6 +35,8 @@ public interface IOutputHistoryActivity
   public static final String EXCLUDED_LENGTH = "EXCLUDEDLENGTH";
   public static final String EXCLUDED_MIMETYPE = "EXCLUDEDMIMETYPE";
   public static final String EXCLUDED_DATE = "EXCLUDEDDATE";
+  public static final String EXCLUDED_CONTENT = "EXCLUDEDCONTENT";
+
   /**
    * Use this result code when security info is not recognized. 
    */

Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java Tue May 26 10:41:38 2015
@@ -32,6 +32,8 @@ public interface IHistoryActivity
   public static final String EXCLUDED_LENGTH = IOutputHistoryActivity.EXCLUDED_LENGTH;
   public static final String EXCLUDED_MIMETYPE = IOutputHistoryActivity.EXCLUDED_MIMETYPE;
   public static final String EXCLUDED_DATE = IOutputHistoryActivity.EXCLUDED_DATE;
+  public static final String EXCLUDED_CONTENT = IOutputHistoryActivity.EXCLUDED_CONTENT;
+
   /**
    * Use this result code when you get URL value from repository and it is not valid.
    */



Mime
View raw message