Return-Path: X-Original-To: apmail-incubator-connectors-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-connectors-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 863DE7DB7 for ; Sun, 24 Jul 2011 22:23:49 +0000 (UTC) Received: (qmail 38018 invoked by uid 500); 24 Jul 2011 22:23:49 -0000 Delivered-To: apmail-incubator-connectors-commits-archive@incubator.apache.org Received: (qmail 37975 invoked by uid 500); 24 Jul 2011 22:23:48 -0000 Mailing-List: contact connectors-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: connectors-dev@incubator.apache.org Delivered-To: mailing list connectors-commits@incubator.apache.org Received: (qmail 37968 invoked by uid 99); 24 Jul 2011 22:23:48 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 24 Jul 2011 22:23:48 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 24 Jul 2011 22:23:46 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 526A823888CB; Sun, 24 Jul 2011 22:23:26 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r1150505 - in /incubator/lcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ Date: Sun, 24 Jul 2011 22:23:26 -0000 To: connectors-commits@incubator.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20110724222326.526A823888CB@eris.apache.org> Author: kwright Date: Sun Jul 24 22:23:25 2011 New Revision: 1150505 URL: http://svn.apache.org/viewvc?rev=1150505&view=rev Log: Finish CONNECTORS-214. Add post-fetch filtering to the Inclusions and Exclusions tab of the web connector. Modified: incubator/lcf/trunk/CHANGES.txt incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Modified: incubator/lcf/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1150505&r1=1150504&r2=1150505&view=diff ============================================================================== --- incubator/lcf/trunk/CHANGES.txt (original) +++ incubator/lcf/trunk/CHANGES.txt Sun Jul 24 22:23:25 2011 @@ -3,13 +3,17 @@ $Id$ ======================= 0.3-dev ========================= +CONNECTORS-214: Add post-fetch regular expression filtering to Web +connector. +(Karl Wright) + CONNECTORS-225: Derby throws deadlock exceptions when indexing rapidly. (Karl Wright) CONNECTORS-223: Move test classes to be compatible with maven conventions. -(Karl Wright) +(Tobias Rübner, Karl Wright) CONNECTORS-219: Update maven pom.xml files to include proper dependencies and version numbers. Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1150505&r1=1150504&r2=1150505&view=diff ============================================================================== --- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original) +++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java Sun Jul 24 22:23:25 2011 @@ -106,13 +106,21 @@ public class WebcrawlerConfig * text area. */ public static final String NODE_SEEDS = "seeds"; /** Include regexps node. The value of this node contains the regexps that - * must match the canonical URL in order for that URL to be included. These + * must match the canonical URL in order for that URL to be included in the crawl. These * regexps are newline separated, and # starts a comment. */ public static final String NODE_INCLUDES = "includes"; /** Exclude regexps node. The value of this node contains the regexps that - * if any one matches, causes the URL to be excluded. These + * if any one matches, causes the URL to be excluded from the crawl. These * regexps are newline separated, and # starts a comment. */ public static final String NODE_EXCLUDES = "excludes"; + /** Include regexps node. The value of this node contains the regexps that + * must match the canonical URL in order for that URL to be included for indexing. These + * regexps are newline separated, and # starts a comment. */ + public static final String NODE_INCLUDESINDEX = "includesindex"; + /** Exclude regexps node. The value of this node contains the regexps that + * if any one matches, causes the URL to be excluded from indexing. These + * regexps are newline separated, and # starts a comment. */ + public static final String NODE_EXCLUDESINDEX = "excludesindex"; /** Limit to seeds. When value attribute is true, only seed domains will be permitted. */ public static final String NODE_LIMITTOSEEDS = "limittoseeds"; Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1150505&r1=1150504&r2=1150505&view=diff ============================================================================== --- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original) +++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Jul 24 22:23:25 2011 @@ -549,6 +549,8 @@ public class WebcrawlerConnector extends // an object that knows how to do this. DocumentURLFilter filter = new DocumentURLFilter(spec); + String filterVersion = filter.getVersionString(); + String[] rval = new String[documentIdentifiers.length]; long currentTime = System.currentTimeMillis(); @@ -1054,6 +1056,9 @@ public class WebcrawlerConnector extends packList(sb,metadata,'+'); // Done with the parseable part! Add the checksum. sb.append(checkSum); + // Add the filter version + sb.append("+"); + sb.append(filterVersion); rval[i] = sb.toString(); break; case RESULT_RETRY_DOCUMENT: @@ -1139,7 +1144,7 @@ public class WebcrawlerConnector extends // We can exclude it if it does not seem to be a kind of document that the ingestion system knows // about. if (indexDocument) - indexDocument = isDataIngestable(activities,documentIdentifier); + indexDocument = isDataIngestable(activities,documentIdentifier,filter); if (indexDocument) { @@ -3400,6 +3405,8 @@ public class WebcrawlerConnector extends String seeds = ""; String inclusions = ".*\n"; String exclusions = ""; + String inclusionsIndex = ".*\n"; + String exclusionsIndex = ""; boolean includeMatching = false; // Now, loop through description @@ -3425,6 +3432,18 @@ public class WebcrawlerConnector extends if (exclusions == null) exclusions = ""; } + else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX)) + { + inclusionsIndex = sn.getValue(); + if (inclusionsIndex == null) + inclusionsIndex = ""; + } + else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX)) + { + exclusionsIndex = sn.getValue(); + if (exclusionsIndex == null) + exclusionsIndex = ""; + } else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS)) { String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE); @@ -3619,8 +3638,15 @@ public class WebcrawlerConnector extends "\n"+ " \n"+ " \n"+ -" \n"+ +" \n"+ +" \n"+ +" \n"+ +" \n"+ +" \n"+ " \n"+ " \n"+ @@ -3636,6 +3662,7 @@ public class WebcrawlerConnector extends { out.print( "\n"+ +"\n"+ "\n"+ "\n" ); @@ -3649,8 +3676,15 @@ public class WebcrawlerConnector extends "

\n"+ -" \n"+ +" Include in crawl:\n"+ +" \n"+ +"
Include in index:\n"+ +" \n"+ "
\n"+ " \n"+ " \n"+ -" \n"+ +" \n"+ +" \n"+ +" \n"+ +" \n"+ +" \n"+ " \n"+ "

\n"+ -" \n"+ +" Exclude from crawl:\n"+ +" \n"+ +"
Exclude from index:\n"+ +" \n"+ "
\n" @@ -3659,7 +3693,8 @@ public class WebcrawlerConnector extends else { out.print( -"\n" +"\n"+ +"\n" ); } @@ -3900,6 +3935,26 @@ public class WebcrawlerConnector extends ds.addChild(ds.getChildCount(),cn); } + // Get the index inclusions + String inclusionsIndex = variableContext.getParameter("inclusionsindex"); + if (inclusionsIndex != null) + { + // Delete existing index inclusions record first + int i = 0; + while (i < ds.getChildCount()) + { + SpecificationNode sn = ds.getChild(i); + if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX)) + ds.removeChild(i); + else + i++; + } + + SpecificationNode cn = new SpecificationNode(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX); + cn.setValue(inclusionsIndex); + ds.addChild(ds.getChildCount(),cn); + } + // Handle the seeds-only switch String matchingHostsPresent = variableContext.getParameter("matchinghosts_present"); if (matchingHostsPresent != null) @@ -3941,6 +3996,26 @@ public class WebcrawlerConnector extends ds.addChild(ds.getChildCount(),cn); } + // Get the index exclusions + String exclusionsIndex = variableContext.getParameter("exclusionsindex"); + if (exclusionsIndex != null) + { + // Delete existing exclusions record first + int i = 0; + while (i < ds.getChildCount()) + { + SpecificationNode sn = ds.getChild(i); + if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX)) + ds.removeChild(i); + else + i++; + } + + SpecificationNode cn = new SpecificationNode(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX); + cn.setValue(exclusionsIndex); + ds.addChild(ds.getChildCount(),cn); + } + // Read the url specs String urlRegexpCount = variableContext.getParameter("urlregexpcount"); if (urlRegexpCount != null && urlRegexpCount.length() > 0) @@ -4140,6 +4215,8 @@ public class WebcrawlerConnector extends String seeds = ""; String inclusions = ".*\n"; String exclusions = ""; + String inclusionsIndex = ".*\n"; + String exclusionsIndex = ""; boolean includeMatching = false; int i = 0; @@ -4164,6 +4241,18 @@ public class WebcrawlerConnector extends if (exclusions == null) exclusions = ""; } + else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX)) + { + inclusionsIndex = sn.getValue(); + if (inclusionsIndex == null) + inclusionsIndex = ""; + } + else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX)) + { + exclusionsIndex = sn.getValue(); + if (exclusionsIndex == null) + exclusionsIndex = ""; + } else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS)) { String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE); @@ -4304,7 +4393,7 @@ public class WebcrawlerConnector extends out.print( "
\n"+ " \n"+ -" Includes:\n"+ +" Include in crawl:\n"+ " \n" ); try @@ -4346,7 +4435,49 @@ public class WebcrawlerConnector extends " \n"+ "
\n"+ " \n"+ -" Excludes:\n"+ +" Include in index:\n"+ +" \n" + ); + try + { + java.io.Reader str = new java.io.StringReader(inclusionsIndex); + try + { + java.io.BufferedReader is = new java.io.BufferedReader(str); + try + { + while (true) + { + String nextString = is.readLine(); + if (nextString == null) + break; + if (nextString.length() == 0) + continue; + out.print( +" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"
\n" + ); + } + } + finally + { + is.close(); + } + } + finally + { + str.close(); + } + } + catch (java.io.IOException e) + { + throw new ManifoldCFException("IO error: "+e.getMessage(),e); + } + out.print( +" \n"+ +" \n"+ +"
\n"+ +" \n"+ +" Exclude from crawl:\n"+ " \n" ); try @@ -4386,6 +4517,48 @@ public class WebcrawlerConnector extends out.print( " \n"+ " \n"+ +"
\n"+ +" \n"+ +" Exclude from index:\n"+ +" \n" + ); + try + { + java.io.Reader str = new java.io.StringReader(exclusionsIndex); + try + { + java.io.BufferedReader is = new java.io.BufferedReader(str); + try + { + while (true) + { + String nextString = is.readLine(); + if (nextString == null) + break; + if (nextString.length() == 0) + continue; + out.print( +" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"
\n" + ); + } + } + finally + { + is.close(); + } + } + finally + { + str.close(); + } + } + catch (java.io.IOException e) + { + throw new ManifoldCFException("IO error: "+e.getMessage(),e); + } + out.print( +" \n"+ +" \n"+ " \n"+ "
\n" ); @@ -5115,7 +5288,7 @@ public class WebcrawlerConnector extends /** Code to check if an already-fetched document should be ingested. */ - protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier) + protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier, DocumentURLFilter filter) throws ServiceInterruption, ManifoldCFException { if (cache.getResponseCode(documentIdentifier) != 200) @@ -5127,6 +5300,9 @@ public class WebcrawlerConnector extends if (activities.checkURLIndexable(documentIdentifier) == false) return false; + if (filter.isDocumentIndexable(documentIdentifier) == false) + return false; + // Check if it's a recognized content type String contentType = cache.getContentType(documentIdentifier); @@ -6901,14 +7077,20 @@ public class WebcrawlerConnector extends } } - /** This class describes the url filtering information obtained from a digested DocumentSpecification. + /** This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification. */ protected static class DocumentURLFilter { + /** The version string */ + protected String versionString; /** The arraylist of include patterns */ protected ArrayList includePatterns = new ArrayList(); /** The arraylist of exclude patterns */ protected ArrayList excludePatterns = new ArrayList(); + /** The arraylist of index include patterns */ + protected ArrayList includeIndexPatterns = new ArrayList(); + /** The arraylist of index exclude patterns */ + protected ArrayList excludeIndexPatterns = new ArrayList(); /** The hash map of seed hosts, to limit urls by, if non-null */ protected HashMap seedHosts = null; @@ -6923,8 +7105,10 @@ public class WebcrawlerConnector extends public DocumentURLFilter(DocumentSpecification spec) throws ManifoldCFException { - String includes = ""; + String includes = ".*"; String excludes = ""; + String includesIndex = ".*"; + String excludesIndex = ""; String seeds = ""; boolean limitToSeeds = false; int i = 0; @@ -6950,6 +7134,18 @@ public class WebcrawlerConnector extends if (excludes == null) excludes = ""; } + else if (sn.getType().equals(WebcrawlerConfig.NODE_INCLUDESINDEX)) + { + includesIndex = sn.getValue(); + if (includesIndex == null) + includesIndex = ""; + } + else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESINDEX)) + { + excludesIndex = sn.getValue(); + if (excludesIndex == null) + excludesIndex = ""; + } else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS)) { String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE); @@ -7034,10 +7230,16 @@ public class WebcrawlerConnector extends } } + versionString = includesIndex + "+" + excludesIndex; + ArrayList list = stringToArray(includes); compileList(includePatterns,list); list = stringToArray(excludes); compileList(excludePatterns,list); + list = stringToArray(includesIndex); + compileList(includeIndexPatterns,list); + list = stringToArray(excludesIndex); + compileList(excludeIndexPatterns,list); if (limitToSeeds) { @@ -7072,6 +7274,14 @@ public class WebcrawlerConnector extends } } + /** Get whatever contribution to the version string should come from this data. + */ + public String getVersionString() + { + // In practice, this is NOT what controls the set that is spidered, but rather the set that is indexed + return versionString; + } + /** Check if both a document and host are legal. */ public boolean isDocumentAndHostLegal(String url) @@ -7147,6 +7357,45 @@ public class WebcrawlerConnector extends return true; } + /** Check if the document identifier is indexable. + */ + public boolean isDocumentIndexable(String url) + { + // First, verify that the url matches one of the patterns in the include list. + int i = 0; + while (i < includeIndexPatterns.size()) + { + Pattern p = (Pattern)includeIndexPatterns.get(i); + Matcher m = p.matcher(url); + if (m.find()) + break; + i++; + } + if (i == includeIndexPatterns.size()) + { + if (Logging.connectors.isDebugEnabled()) + Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because no include patterns match it"); + return false; + } + + // Now make sure it's not in the exclude list. + i = 0; + while (i < excludeIndexPatterns.size()) + { + Pattern p = (Pattern)excludeIndexPatterns.get(i); + Matcher m = p.matcher(url); + if (m.find()) + { + if (Logging.connectors.isDebugEnabled()) + Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because exclude pattern '"+p.toString()+"' matched it"); + return false; + } + i++; + } + + return true; + } + /** Get canonicalization policies */ public CanonicalizationPolicies getCanonicalizationPolicies() {