manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1840145 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Date Wed, 05 Sep 2018 16:42:37 GMT
Author: kwright
Date: Wed Sep  5 16:42:37 2018
New Revision: 1840145

URL: http://svn.apache.org/viewvc?rev=1840145&view=rev
Log:
CONNECTORS-1528: Strip out duplicate slashes from path

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1840145&r1=1840144&r2=1840145&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Wed Sep  5 16:42:37 2018
@@ -3670,6 +3670,12 @@ public class WebcrawlerConnector extends
 
     }
 
+    // Remove duplicate path slashes.  This is gated by the "lowercase" selection, since it's also an IIS-specific problem.
+    if (p != null && p.canLowercase())
+    {
+      pathString = filterMultipleSlashes(pathString);
+    }
+    
     // Put it back into the URL without the ref, and with the modified query and path parts.
     url = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
     String rval = url.toASCIIString();
@@ -3681,6 +3687,19 @@ public class WebcrawlerConnector extends
     return rval;
   }
 
+  private static String filterMultipleSlashes(String pathString) { // Collapses every run of consecutive '/' in pathString down to a single '/'; pathString must be non-null.
+    // Each pass rescans from the start and copies the string, so this is only cheap when duplicate slashes are rare.
+    while (true)
+    {
+      final int index = pathString.indexOf("//"); // position of the first remaining adjacent pair of slashes
+      if (index == -1)
+      {
+        return pathString; // no "//" left: done (an input that never contained "//" is returned as-is)
+      }
+      pathString = pathString.substring(0, index) + pathString.substring(index + 1); // drop one slash of the pair; a longer run is caught again on the next pass
+    }
+  }
+  
   /** Code to check if data is interesting, based on response code and content type.
   */
   protected boolean isContentInteresting(IFingerprintActivity activities, String documentIdentifier, int response, String contentType)



Mime
View raw message