incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1158476 - in /incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss: RSSConnector.java WebURL.java
Date Wed, 17 Aug 2011 00:19:48 GMT
Author: kwright
Date: Wed Aug 17 00:19:47 2011
New Revision: 1158476

URL: http://svn.apache.org/viewvc?rev=1158476&view=rev
Log:
Replace java.net.URI with my own WebURL wrapper class that corrects known problems with the
former.

Added:
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java   (with props)
Modified:
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1158476&r1=1158475&r2=1158476&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Wed Aug 17 00:19:47 2011
@@ -365,7 +365,7 @@ public class RSSConnector extends org.ap
   {
     try
     {
-      java.net.URI uri = new java.net.URI(documentIdentifier);
+      WebURL uri = new WebURL(documentIdentifier);
       return new String[]{uri.getHost()};
     }
     catch (URISyntaxException e)
@@ -448,14 +448,14 @@ public class RSSConnector extends org.ap
       }
       rawURL = sb.toString();
 
-      java.net.URI url;
+      WebURL url;
       if (parentIdentifier != null)
       {
-        java.net.URI parentURL = new java.net.URI(parentIdentifier);
+        WebURL parentURL = new WebURL(parentIdentifier);
         url = parentURL.resolve(rawURL);
       }
       else
-        url = new java.net.URI(rawURL);
+        url = new WebURL(rawURL);
 
       String protocol = url.getScheme();
       String host = url.getHost();
@@ -527,7 +527,7 @@ public class RSSConnector extends org.ap
 
  /** Code to canonicalize a URL.  If URL cannot be canonicalized (and is illegal) return null.
   */
-  protected static String doCanonicalization(CanonicalizationPolicy p, java.net.URI url)
+  protected static String doCanonicalization(CanonicalizationPolicy p, WebURL url)
     throws ManifoldCFException, java.net.URISyntaxException
   {
     // Note well: The java.net.URI class mistreats the query part of the URI, near as I can tell, in the following ways:
@@ -709,14 +709,8 @@ public class RSSConnector extends org.ap
     }
 
     // Put it back into the URL without the ref, and with the modified query and path parts.
-    url = new java.net.URI(url.getScheme(),null,url.getHost(),url.getPort(),pathString,null,null);
+    url = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
     String rval = url.toASCIIString();
-    // If there's a non-empty query string, append it to the url using our own logic; this is necessary because java.net.URI is broken as far as query escaping
-    // goes.
-    if (rval != null && queryString != null && queryString.length() > 0)
-    {
-      rval += "?" + queryString;
-    }
     return rval;
   }
 

Added: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java?rev=1158476&view=auto
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java (added)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java Wed Aug 17 00:19:47 2011
@@ -0,0 +1,114 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.rss;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+
+/** Replacement class for java.net.URI, which is broken in many ways.
+*/
+public class WebURL
+{
+  protected URI theURL;
+  protected String rawQueryPart;
+  
+  public WebURL(String url)
+    throws URISyntaxException
+  {
+    theURL = new URI(url);
+    rawQueryPart = null;
+  }
+  
+  public WebURL(String scheme, String host, int port, String path, String queryPart)
+    throws URISyntaxException
+  {
+    theURL = new URI(scheme, null, host, port, path, null, null);
+    rawQueryPart = queryPart;
+  }
+  
+  public WebURL(URI theURL)
+  {
+    this(theURL,null);
+  }
+  
+  public WebURL(URI theURL, String rawQueryPart)
+  {
+    this.theURL = theURL;
+    this.rawQueryPart = rawQueryPart;
+  }
+  
+  public WebURL resolve(String raw)
+    throws URISyntaxException
+  {
+    URI rawURL = new URI(raw);
+    if (rawURL.isAbsolute())
+      return new WebURL(rawURL);
+    URI fixedURL = theURL;
+    if (theURL.getPath() == null || theURL.getPath().length() == 0)
+      fixedURL = new URI(theURL.getScheme(),null,theURL.getHost(),theURL.getPort(),"/",null,null);
+
+    if (raw.startsWith("?"))
+      return new WebURL(fixedURL.getScheme(),fixedURL.getHost(),fixedURL.getPort(),fixedURL.getPath(),rawURL.getRawQuery());
+    
+    return new WebURL(fixedURL.resolve(rawURL));
+  }
+  
+  public String getPath()
+  {
+    return theURL.getPath();
+  }
+  
+  public String getHost()
+  {
+    return theURL.getHost();
+  }
+  
+  public String getScheme()
+  {
+    return theURL.getScheme();
+  }
+  
+  public int getPort()
+  {
+    return theURL.getPort();
+  }
+  
+  public String getRawQuery()
+  {
+    if (rawQueryPart != null)
+      return rawQueryPart;
+    return theURL.getRawQuery();
+  }
+  
+  public String toASCIIString()
+  {
+    String rval = theURL.toASCIIString();
+    if (rval != null && rawQueryPart != null && rawQueryPart.length() > 0)
+      rval += "?" + rawQueryPart;
+    return rval;
+  }
+  
+  public String toString()
+  {
+    String rval = theURL.toString();
+    if (rval != null && rawQueryPart != null && rawQueryPart.length() > 0)
+      rval += "?" + rawQueryPart;
+    return rval;
+  }
+}

Propchange: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/WebURL.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message