manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1066559 - in /incubator/lcf/trunk: CHANGES.txt connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Date Wed, 02 Feb 2011 18:14:08 GMT
Author: kwright
Date: Wed Feb  2 18:14:07 2011
New Revision: 1066559

URL: http://svn.apache.org/viewvc?rev=1066559&view=rev
Log:
Fix for CONNECTORS-157, and more fixes for CONNECTORS-153.

Modified:
    incubator/lcf/trunk/CHANGES.txt
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1066559&r1=1066558&r2=1066559&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Wed Feb  2 18:14:07 2011
@@ -3,6 +3,10 @@ $Id$
 
 ==================  0.2-dev ==================
 
+CONNECTORS-157: Web crawler url resolution was broken for relative
+paths, because the way java.net.URI resolved them changed.
+(Karl Wright)
+
 CONNECTORS-156: Update site to describe work-around instructions.
 (Karl Wright)
 

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1066559&r1=1066558&r2=1066559&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Wed Feb  2 18:14:07 2011
@@ -1140,7 +1140,7 @@ public class WebcrawlerConnector extends
         if (indexDocument)
           indexDocument = isDataIngestable(activities,documentIdentifier);
 
-        if (isDataIngestable(activities,documentIdentifier))
+        if (indexDocument)
         {
           // Ingest the document
           if (Logging.connectors.isDebugEnabled())
@@ -4771,13 +4771,24 @@ public class WebcrawlerConnector extends
     try
     {
       java.net.URI url;
+      java.net.URI rawPiece = new java.net.URI(rawURL);
       if (parentIdentifier != null)
       {
-        java.net.URI parentURL = new java.net.URI(parentIdentifier);
-        url = parentURL.resolve(rawURL);
+        // Work around bug in java.net.URI.resolve().  Relative paths do not work
+        // here; we must make them absolute somehow.
+        if (rawPiece.isAbsolute())
+          url = rawPiece;
+        else
+        {
+          java.net.URI parentURL = new java.net.URI(parentIdentifier);
+          if (!rawURL.startsWith("/"))
+            url = parentURL.resolve("/"+rawURL);
+          else
+            url = parentURL.resolve(rawPiece);
+        }
       }
       else
-        url = new java.net.URI(rawURL);
+        url = rawPiece;
 
       String protocol = url.getScheme();
       String host = url.getHost();
@@ -4834,6 +4845,7 @@ public class WebcrawlerConnector extends
     }
     catch (java.net.URISyntaxException e)
     {
+      e.printStackTrace();
       if (Logging.connectors.isDebugEnabled())
         Logging.connectors.debug("WEB: Can't use url '"+rawURL+"' because it is badly formed: "+e.getMessage());
       return null;
@@ -5636,11 +5648,11 @@ public class WebcrawlerConnector extends
           // Parse content value
           try
           {
-            String[] contentValues = contentValue.split("[, ]");
+            String[] contentValues = contentValue.split(",");
             int i = 0;
             while (i < contentValues.length)
             {
-              String cv = contentValues[i++];
+              String cv = contentValues[i++].trim();
               if (cv.equals("index"))
                 allowIndex = true;
               else if (cv.equals("noindex"))



Mime
View raw message