incubator-droids-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ol...@apache.org
Subject svn commit: r983394 - in /incubator/droids/trunk/droids-core/src/main: java/org/apache/droids/protocol/http/ java/org/apache/droids/robot/crawler/ resources/
Date Sun, 08 Aug 2010 12:05:56 GMT
Author: olegk
Date: Sun Aug  8 12:05:56 2010
New Revision: 983394

URL: http://svn.apache.org/viewvc?rev=983394&view=rev
Log:
Fixed NPE in CrawlingWorker; improved exception handling and logging in the HTTP transport

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
    incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java Sun Aug  8 12:05:56 2010
@@ -18,13 +18,13 @@ package org.apache.droids.protocol.http;
 
 import java.io.IOException;
 import java.io.InterruptedIOException;
-import java.net.ConnectException;
-import java.net.UnknownHostException;
+  import java.net.UnknownHostException;
 
 import javax.net.ssl.SSLHandshakeException;
 
 import org.apache.http.NoHttpResponseException;
 import org.apache.http.client.HttpRequestRetryHandler;
+import org.apache.http.conn.HttpHostConnectException;
 import org.apache.http.protocol.HttpContext;
 
 class DroidsRequestRetryHandler implements HttpRequestRetryHandler
@@ -68,7 +68,7 @@ class DroidsRequestRetryHandler implemen
       // Unknown host
       return false;
     }
-    if (exception instanceof ConnectException) {
+    if (exception instanceof HttpHostConnectException) {
       // Connection refused
       return false;
     }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java Sun Aug  8 12:05:56 2010
@@ -76,7 +76,7 @@ public class HttpProtocol extends Loggab
     return new HttpContentEntity(entity, maxlen);
   }
 
-  public boolean isAllowed(URI uri) {
+  public boolean isAllowed(URI uri) throws IOException {
     if (forceAllow) {
       return forceAllow;
     }
@@ -105,14 +105,10 @@ public class HttpProtocol extends Loggab
     } catch (NoRobotException ex) {
       log.error("Failure parsing robots.txt: " + ex.getMessage());
       return false;
-    } catch (IOException ex) {
-      log.error("I/O error parsing robots.txt: " + ex.getMessage());
-      return false;
     }
     boolean test = nrc.isUrlAllowed(uri);
-    String message = (test) ? "allowed" : "denied";
     if (log.isInfoEnabled()) {
-      log.info("Url is " + message);
+      log.info(uri + " is " + (test ? "allowed" : "denied"));
     }
     return test;
   }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java Sun Aug  8 12:05:56 2010
@@ -52,6 +52,13 @@ public class CrawlingWorker extends Logg
     }
     URI uri = link.getURI();
     final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
+    if (protocol == null) {
+      if (log.isWarnEnabled()) {
+        log.warn("Unsupported protocol scheme '" + uri.getScheme() + "'");
+      }
+      return;
+    }
+    
     if (protocol.isAllowed(uri)) {
       if (log.isInfoEnabled()) {
         log.info("Loading " + uri);
@@ -87,8 +94,10 @@ public class CrawlingWorker extends Logg
       }
     } 
     else {
-      log.info("Stopping processing since"
-          + " bots are not allowed for this url.");
+      if (log.isInfoEnabled()) {
+        log.info("Stopping processing since"
+            + " bots are not allowed for " + uri );
+      }
     }
   }
   

Modified: incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt (original)
+++ incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt Sun Aug  8 12:05:56 2010
@@ -23,7 +23,7 @@
 # matches, the URL is ignored.
 
 # skip file: ftp: and mailto: urls
--^(ftp|mailto):
+-^(ftp|mailto|irc):
 
 # skip URLs containing certain characters as probable queries, etc.
 -[*!@#]



Mime
View raw message