From droids-commits-return-226-apmail-incubator-droids-commits-archive=incubator.apache.org@incubator.apache.org Sun Aug 08 12:07:16 2010 Return-Path: Delivered-To: apmail-incubator-droids-commits-archive@minotaur.apache.org Received: (qmail 87876 invoked from network); 8 Aug 2010 12:07:16 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 8 Aug 2010 12:07:16 -0000 Received: (qmail 14259 invoked by uid 500); 8 Aug 2010 12:07:16 -0000 Delivered-To: apmail-incubator-droids-commits-archive@incubator.apache.org Received: (qmail 14231 invoked by uid 500); 8 Aug 2010 12:07:15 -0000 Mailing-List: contact droids-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: droids-dev@incubator.apache.org Delivered-To: mailing list droids-commits@incubator.apache.org Received: (qmail 14223 invoked by uid 99); 8 Aug 2010 12:07:14 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 08 Aug 2010 12:07:14 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 08 Aug 2010 12:07:13 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id D11F923889D5; Sun, 8 Aug 2010 12:05:56 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r983394 - in /incubator/droids/trunk/droids-core/src/main: java/org/apache/droids/protocol/http/ java/org/apache/droids/robot/crawler/ resources/ Date: Sun, 08 Aug 2010 12:05:56 -0000 To: droids-commits@incubator.apache.org From: olegk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100808120556.D11F923889D5@eris.apache.org> Author: olegk Date: Sun Aug 8 12:05:56 2010 New Revision: 983394 URL: http://svn.apache.org/viewvc?rev=983394&view=rev Log: Fixed NPE in CrawlingWorker; improved exception handling and logging in the HTTP transport Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java?rev=983394&r1=983393&r2=983394&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java Sun Aug 8 12:05:56 2010 @@ -18,13 +18,13 @@ package org.apache.droids.protocol.http; import java.io.IOException; import java.io.InterruptedIOException; -import java.net.ConnectException; -import java.net.UnknownHostException; + import java.net.UnknownHostException; import javax.net.ssl.SSLHandshakeException; import org.apache.http.NoHttpResponseException; import org.apache.http.client.HttpRequestRetryHandler; +import org.apache.http.conn.HttpHostConnectException; import org.apache.http.protocol.HttpContext; class DroidsRequestRetryHandler implements HttpRequestRetryHandler @@ -68,7 +68,7 @@ class DroidsRequestRetryHandler implemen // Unknown host return false; } - if (exception instanceof ConnectException) { + if (exception instanceof HttpHostConnectException) { // Connection refused return false; } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=983394&r1=983393&r2=983394&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java Sun Aug 8 12:05:56 2010 @@ -76,7 +76,7 @@ public class HttpProtocol extends Loggab return new HttpContentEntity(entity, maxlen); } - public boolean isAllowed(URI uri) { + public boolean isAllowed(URI uri) throws IOException { if (forceAllow) { return forceAllow; } @@ -105,14 +105,10 @@ public class HttpProtocol extends Loggab } catch (NoRobotException ex) { log.error("Failure parsing robots.txt: " + ex.getMessage()); return false; - } catch (IOException ex) { - log.error("I/O error parsing robots.txt: " + ex.getMessage()); - return false; } boolean test = nrc.isUrlAllowed(uri); - String message = (test) ? "allowed" : "denied"; if (log.isInfoEnabled()) { - log.info("Url is " + message); + log.info(uri + " is " + (test ? "allowed" : "denied")); } return test; } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=983394&r1=983393&r2=983394&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java Sun Aug 8 12:05:56 2010 @@ -52,6 +52,13 @@ public class CrawlingWorker extends Logg } URI uri = link.getURI(); final Protocol protocol = droid.getProtocolFactory().getProtocol(uri); + if (protocol == null) { + if (log.isWarnEnabled()) { + log.warn("Unsupported protocol scheme '" + uri.getScheme() + "'"); + } + return; + } + if (protocol.isAllowed(uri)) { if (log.isInfoEnabled()) { log.info("Loading " + uri); @@ -87,8 +94,10 @@ public class CrawlingWorker extends Logg } } else { - log.info("Stopping processing since" - + " bots are not allowed for this url."); + if (log.isInfoEnabled()) { + log.info("Stopping processing since" + + " bots are not allowed for " + uri ); + } } } Modified: incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt?rev=983394&r1=983393&r2=983394&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt (original) +++ incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt Sun Aug 8 12:05:56 2010 @@ -23,7 +23,7 @@ # matches, the URL is ignored. # skip file: ftp: and mailto: urls --^(ftp|mailto): +-^(ftp|mailto|irc): # skip URLs containing certain characters as probable queries, etc. -[*!@#]