incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1158475 - in /incubator/lcf/trunk: ./ connectors/webcrawler/ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ connectors/webcrawler/connector/src/test/java/org/ connectors/webcrawler/connec...
Date Wed, 17 Aug 2011 00:08:43 GMT
Author: kwright
Date: Wed Aug 17 00:08:43 2011
New Revision: 1158475

URL: http://svn.apache.org/viewvc?rev=1158475&view=rev
Log:
More changes under ticket CONNECTORS-157.  Another case was detected where the java.net.URI
code did the wrong thing: specifically when URL was relative with respect to the query string.

Added:
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java
  (with props)
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/
    incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java
  (with props)
Modified:
    incubator/lcf/trunk/CHANGES.txt
    incubator/lcf/trunk/connectors/webcrawler/build.xml
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1158475&r1=1158474&r2=1158475&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Wed Aug 17 00:08:43 2011
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 0.3-dev =========================
 
+CONNECTORS-157: Fixed a second kind of case where the java.net.URI
+class is broken.  Relative queries starting with "?" now replace the query
+part of the url.
+(David Broadfoot, Karl Wright)
+
 CONNECTORS-239: RSS connector chromed content mode was broken.
 (Kate McGonigal, Karl Wright)
 

Modified: incubator/lcf/trunk/connectors/webcrawler/build.xml
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/build.xml?rev=1158475&r1=1158474&r2=1158475&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/build.xml (original)
+++ incubator/lcf/trunk/connectors/webcrawler/build.xml Wed Aug 17 00:08:43 2011
@@ -51,11 +51,46 @@
         <copy todir="dist/lib" file="build/jar/mcf-web-connector.jar"/>
     </target>
 
-    <target name="build" depends="lib"/>
-    <target name="build-tests">
-      <mkdir dir="build/test-jar"/>
+    <target name="compile-connector-tests" depends="compile-connector">
+        <mkdir dir="build/connector-tests/classes"/>
+        <javac srcdir="connector/src/test/java" destdir="build/connector-tests/classes" target="1.5" source="1.5" debug="true" debuglevel="lines,vars,source">
+            <classpath>
+                 <fileset dir="lib"> 
+                    <include name="*.jar"/> 
+                </fileset>
+                <pathelement location="build/connector/classes"/>
+            </classpath>
+        </javac>
+    </target>
+    
+    <target name="compile-tests" depends="compile-connector-tests"/>
+    
+    <target name="jar-connector-tests" depends="compile-connector-tests">
+        <mkdir dir="build/test-jar"/>
+        <jar destfile="build/test-jar/mcf-filesystem-connector-tests.jar" basedir="build/connector-tests/classes"/>
     </target>
-    <target name="run-tests"/>
+
+    <target name="jar-tests" depends="jar-connector-tests"/>
+    
+    <target name="run-tests" depends="compile-tests">
+        <mkdir dir="test-output"/>
+        <junit fork="true" maxmemory="128m" dir="test-output" showoutput="true" haltonfailure="true">
+            <classpath>
+                <fileset dir="lib">
+                    <include name="*.jar"/>
+                </fileset>
+                <pathelement location="build/connector/classes"/>
+                <pathelement location="build/connector-tests/classes"/>
+            </classpath>
+            <test name="org.apache.manifoldcf.crawler.connectors.webcrawler.tests.URLTest" todir="test-output"/>
+            <!-- MHL -->
+        </junit>
+    </target>
+
+    <target name="build" depends="lib"/>
+    
+    <target name="build-tests" depends="jar-tests"/>
+    
     <target name="run-tests-postgresql"/>
     <target name="run-tests-HSQLDB"/>
     <target name="all" depends="build,doc,build-tests,run-tests,run-tests-postgresql,run-tests-HSQLDB"/>

Added: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java?rev=1158475&view=auto
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java (added)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java Wed Aug 17 00:08:43 2011
@@ -0,0 +1,114 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+
+/** Replacement class for java.net.URI, which is broken in many ways.
+*/
+public class WebURL
+{
+  protected URI theURL;
+  protected String rawQueryPart;
+  
+  public WebURL(String url)
+    throws URISyntaxException
+  {
+    theURL = new URI(url);
+    rawQueryPart = null;
+  }
+  
+  public WebURL(String scheme, String host, int port, String path, String queryPart)
+    throws URISyntaxException
+  {
+    theURL = new URI(scheme, null, host, port, path, null, null);
+    rawQueryPart = queryPart;
+  }
+  
+  public WebURL(URI theURL)
+  {
+    this(theURL,null);
+  }
+  
+  public WebURL(URI theURL, String rawQueryPart)
+  {
+    this.theURL = theURL;
+    this.rawQueryPart = rawQueryPart;
+  }
+  
+  public WebURL resolve(String raw)
+    throws URISyntaxException
+  {
+    URI rawURL = new URI(raw);
+    if (rawURL.isAbsolute())
+      return new WebURL(rawURL);
+    URI fixedURL = theURL;
+    if (theURL.getPath() == null || theURL.getPath().length() == 0)
+      fixedURL = new URI(theURL.getScheme(),null,theURL.getHost(),theURL.getPort(),"/",null,null);
+
+    if (raw.startsWith("?"))
+      return new WebURL(fixedURL.getScheme(),fixedURL.getHost(),fixedURL.getPort(),fixedURL.getPath(),rawURL.getRawQuery());
+    
+    return new WebURL(fixedURL.resolve(rawURL));
+  }
+  
+  public String getPath()
+  {
+    return theURL.getPath();
+  }
+  
+  public String getHost()
+  {
+    return theURL.getHost();
+  }
+  
+  public String getScheme()
+  {
+    return theURL.getScheme();
+  }
+  
+  public int getPort()
+  {
+    return theURL.getPort();
+  }
+  
+  public String getRawQuery()
+  {
+    if (rawQueryPart != null)
+      return rawQueryPart;
+    return theURL.getRawQuery();
+  }
+  
+  public String toASCIIString()
+  {
+    String rval = theURL.toASCIIString();
+    if (rval != null && rawQueryPart != null && rawQueryPart.length() > 0)
+      rval += "?" + rawQueryPart;
+    return rval;
+  }
+  
+  public String toString()
+  {
+    String rval = theURL.toString();
+    if (rval != null && rawQueryPart != null && rawQueryPart.length() > 0)
+      rval += "?" + rawQueryPart;
+    return rval;
+  }
+}

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebURL.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1158475&r1=1158474&r2=1158475&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Wed Aug 17 00:08:43 2011
@@ -4952,24 +4952,14 @@ public class WebcrawlerConnector extends
   {
     try
     {
-      java.net.URI url;
-      java.net.URI rawPiece = new java.net.URI(rawURL);
+      WebURL url;
       if (parentIdentifier != null)
       {
-        // Work around bug in java.net.URI.resolve().  Relative paths do not work
-        // here; we must make them absolute somehow.
-        if (rawPiece.isAbsolute())
-          url = rawPiece;
-        else
-        {
-          java.net.URI parentURL = new java.net.URI(parentIdentifier);
-          if (parentURL.getPath() == null || parentURL.getPath().length() == 0)
-            parentURL = new java.net.URI(parentIdentifier + "/");
-          url = parentURL.resolve(rawPiece);
-        }
+        WebURL parentURL = new WebURL(parentIdentifier);
+        url = parentURL.resolve(rawURL);
       }
       else
-        url = rawPiece;
+        url = new WebURL(rawURL);
 
       String protocol = url.getScheme();
       String host = url.getHost();
@@ -5043,7 +5033,7 @@ public class WebcrawlerConnector extends
 
   /** Code to canonicalize a URL.  If URL cannot be canonicalized (and is illegal) return null.
   */
-  protected String doCanonicalization(DocumentURLFilter filter, java.net.URI url)
+  protected String doCanonicalization(DocumentURLFilter filter, WebURL url)
     throws ManifoldCFException, java.net.URISyntaxException
   {
     // First, we have to figure out what the canonicalization policy is.
@@ -5052,13 +5042,9 @@ public class WebcrawlerConnector extends
     String pathString = url.getPath();
     String queryString = url.getRawQuery();
 
-    java.net.URI rawURI = new java.net.URI(url.getScheme(),null,url.getHost(),url.getPort(),pathString,null,null);
+    WebURL rawURI = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
     String completeRawURL = rawURI.toASCIIString();
 
-    if (completeRawURL != null && queryString != null && queryString.length() > 0)
-    {
-      completeRawURL += "?" + queryString;
-    }
     CanonicalizationPolicy p;
     if (completeRawURL != null)
       p = filter.getCanonicalizationPolicies().findMatch(completeRawURL);
@@ -5245,14 +5231,8 @@ public class WebcrawlerConnector extends
     }
 
     // Put it back into the URL without the ref, and with the modified query and path parts.
-    url = new java.net.URI(url.getScheme(),null,url.getHost(),url.getPort(),pathString,null,null);
+    url = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
     String rval = url.toASCIIString();
-    // If there's a non-empty query string, append it to the url using our own logic; this is necessary because java.net.URI is broken as far as query escaping
-    // goes.
-    if (rval != null && queryString != null && queryString.length() > 0)
-    {
-      rval += "?" + queryString;
-    }
     return rval;
   }
 

Added: incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java?rev=1158475&view=auto
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java (added)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java Wed Aug 17 00:08:43 2011
@@ -0,0 +1,74 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler.tests;
+
+import org.apache.manifoldcf.crawler.connectors.webcrawler.WebURL;
+import org.junit.*;
+import static org.junit.Assert.*;
+
+public class URLTest
+{
+
+  @Test
+  public void absolutePath()
+    throws Exception
+  {
+    WebURL parent = new WebURL("http://foo.com");
+    WebURL resolved = parent.resolve("http://bar.com");
+    assertEquals(resolved.toASCIIString(),"http://bar.com");
+  }
+
+  @Test
+  public void relativePath()
+    throws Exception
+  {
+    WebURL parent = new WebURL("http://foo.com/abc/def.html");
+    WebURL resolved = parent.resolve("/def/ghi.html");
+    assertEquals(resolved.toASCIIString(),"http://foo.com/def/ghi.html");
+  }
+
+  @Test
+  public void noSlashDocument()
+    throws Exception
+  {
+    WebURL parent = new WebURL("http://foo.com");
+    WebURL resolved = parent.resolve("hello.pdf");
+    assertEquals(resolved.toASCIIString(),"http://foo.com/hello.pdf");
+  }
+
+  @Test
+  public void relativeQuery()
+    throws Exception
+  {
+    WebURL parent = new WebURL("http://foo.com/abc/def/ghi.asmx?q=foo");
+    WebURL resolved = parent.resolve("?q=bar");
+    assertEquals(resolved.toASCIIString(),"http://foo.com/abc/def/ghi.asmx?q=bar");
+  }
+
+  @Test
+  public void queryEscaping()
+    throws Exception
+  {
+    WebURL parent = new WebURL("http://foo.com/abc/def/ghi.asmx?q=foo%3Dbar");
+    WebURL resolved = parent.resolve("?q=bar%3Dfoo");
+    assertEquals(resolved.toASCIIString(),"http://foo.com/abc/def/ghi.asmx?q=bar%3Dfoo");
+  }
+
+
+}

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/URLTest.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message