incubator-droids-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ol...@apache.org
Subject svn commit: r713691 - in /incubator/droids/trunk/droids-core: ./ src/main/java/org/apache/droids/api/ src/main/java/org/apache/droids/exception/ src/main/java/org/apache/droids/helper/factories/ src/main/java/org/apache/droids/net/ src/main/java/org/ap...
Date Thu, 13 Nov 2008 09:47:56 GMT
Author: olegk
Date: Thu Nov 13 01:47:55 2008
New Revision: 713691

URL: http://svn.apache.org/viewvc?rev=713691&view=rev
Log:
API changes:
* Added ContentEntity interface representing a body of content retrieved from a URI
* Replaced default HTTP protocol implementation based on URLConnection with one based on HttpClient 4.0

Added:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
Removed:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/net/UrlHelper.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/HttpBase.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/MediaType.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
Modified:
    incubator/droids/trunk/droids-core/pom.xml
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
    incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
    incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
    incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
    incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java

Modified: incubator/droids/trunk/droids-core/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/pom.xml?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/pom.xml (original)
+++ incubator/droids/trunk/droids-core/pom.xml Thu Nov 13 01:47:55 2008
@@ -52,10 +52,6 @@
       <artifactId>commons-logging</artifactId>
     </dependency>
     <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-    </dependency>
-    <dependency>
       <groupId>org.apache.geronimo.specs</groupId>
       <artifactId>geronimo-stax-api_1.0_spec</artifactId>
     </dependency>

Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java?rev=713691&view=auto
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
(added)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.api;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Abstract interface representing a body of content with a particular 
+ * MIME type and an optional charset.
+ * <p>
+ * IMPORTANT: implementations of this interface MUST ensure that the content
+ * is repeatable, that is, the content can be consumed more than once. 
+ * <p>
+ * IMPORTANT: The consumer of the entity content MUST close the input stream 
+ * returned by {@link #obtainContent()} when finished reading the content. 
+ * The consumer MUST call {@link #finish()} when the entity is no longer 
+ * needed in order to release underlying resources held by the entity. 
+ * 
+ * @version 1.0
+ */
+public interface ContentEntity {
+
+  /**
+   * Returns content of the entity as an input stream. This input stream
+   * MUST be closed by the consumer when finished reading content.
+   * <p/>
+   * IMPORTANT: This method MUST return a new instance of {@link InputStream}
+   * to ensure the content can be consumed more than once.
+   *  
+   * @return input stream
+   * @throws IOException
+   */
+  InputStream obtainContent() throws IOException;
+  
+  /**
+   * Returns MIME type of the entity.
+   * 
+   * @return MIME type
+   */
+  String getMimeType();
+  
+  /**
+   * Returns charset of the entity if known. Otherwise returns 
+   * <code>null</code>.
+   * 
+   * @return charset
+   */
+  String getCharset();
+  
+  /**
+   * Release all underlying resources held by the entity.
+   */
+  void finish();
+  
+}
\ No newline at end of file

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Thu
Nov 13 01:47:55 2008
@@ -17,7 +17,6 @@
 package org.apache.droids.api;
 
 import java.io.IOException;
-import java.io.InputStream;
 
 import org.apache.droids.exception.DroidsException;
 
@@ -38,5 +37,5 @@
    *                the link that correspond to the stream
    * @return the parse object
    */
-  Parse getParse(InputStream openStream, Link link) throws DroidsException, IOException;
+  Parse getParse(ContentEntity entity, Link link) throws DroidsException, IOException;
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java Thu
Nov 13 01:47:55 2008
@@ -17,8 +17,8 @@
 package org.apache.droids.api;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.net.URI;
 
 /**
  * The protocol interface is a wrapper to hide the underlying implementation of
@@ -37,24 +37,16 @@
    * @return true if we can request the url. false if we are forbidden.
    * @throws MalformedURLException
    */
-  boolean isAllowed(String url) throws MalformedURLException;
+  boolean isAllowed(URI url) throws IOException;
 
   /**
-   * Return the stream represent of the url
+   * Return the content entity representation of the url
    * 
    * @param url
    *                url of the stream we want to open
-   * @return the stream of the given url
+   * @return the content of the given url
    * @throws IOException
    */
-  InputStream openStream(String url) throws IOException;
+  ContentEntity load(URI uri) throws IOException;
 
-  /**
-   * Returns the content type of the url
-   * 
-   * @param url
-   *                url to evaluate
-   * @return registered content type
-   */
-  String getContentType(String url);
 }
\ No newline at end of file

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java
Thu Nov 13 01:47:55 2008
@@ -16,6 +16,8 @@
  */
 package org.apache.droids.exception;
 
+import java.net.URI;
+
 /**
  * If we do not have any instance of a protocol registered for the iven url.
  * 
@@ -24,7 +26,7 @@
  */
 public class ProtocolNotFoundException extends DroidsException {
   private static final long serialVersionUID = 6980937469875896426L;
-  private String url = null;
+  private URI uri = null;
 
   /**
    * Create an exception for the given url
@@ -32,8 +34,8 @@
    * @param url
    *                url where we do not have a suitable protocol
    */
-  public ProtocolNotFoundException(String url) {
-    this(url, "protocol not found for url=" + url);
+  public ProtocolNotFoundException(URI uri) {
+    this(uri, "protocol not found for uri=" + uri);
   }
 
   /**
@@ -44,9 +46,9 @@
    * @param message
    *                detailed message to explain the underlying cause
    */
-  public ProtocolNotFoundException(String url, String message) {
+  public ProtocolNotFoundException(URI uri, String message) {
     super(message);
-    this.url = url;
+    this.uri = uri;
   }
 
   /**
@@ -54,7 +56,7 @@
    * 
    * @return url which has caused the problem
    */
-  public String getUrl() {
-    return url;
+  public URI getUri() {
+    return uri;
   }
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java
Thu Nov 13 01:47:55 2008
@@ -16,13 +16,11 @@
  */
 package org.apache.droids.helper.factories;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
 
-import org.apache.commons.io.IOUtils;
+import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Handler;
 import org.apache.droids.api.Parse;
 import org.apache.droids.exception.DroidsException;
@@ -47,21 +45,15 @@
    *                the underlying parse object
    * @return false if we found a problem, true if all went well
    */
-  public boolean handle(InputStream stream, URI uri, Parse parse) 
+  public boolean handle(ContentEntity entity, URI uri, Parse parse) 
       throws DroidsException, IOException {
-    byte[] streamCopy = null;
-    if(stream==null){
-      return false;
-    }
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    IOUtils.copy(stream, out);
-    streamCopy = out.toByteArray();
-    
     for (Handler handler : getMap().values()) {
-      if (streamCopy == null) {
-        return false;
+      InputStream instream = entity.obtainContent();
+      try {
+        handler.handle(instream, uri, parse);
+      } finally {
+        instream.close();
       }
-      handler.handle(new ByteArrayInputStream(streamCopy), uri, parse);
     }
     return true;
   }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java
Thu Nov 13 01:47:55 2008
@@ -16,8 +16,7 @@
  */
 package org.apache.droids.helper.factories;
 
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.URI;
 
 import org.apache.droids.api.Protocol;
 import org.apache.droids.exception.ProtocolNotFoundException;
@@ -38,13 +37,10 @@
    * @return ready to use protocol plugin or null if non have been found
    * @throws ProtocolNotFoundException
    */
-  public Protocol getProtocol(String uri) 
-      throws MalformedURLException, ProtocolNotFoundException {
-    URL url = null;
+  public Protocol getProtocol(URI uri) throws ProtocolNotFoundException {
     Protocol protocol = null;
     try {
-      url = new URL(uri);
-      String protocolName = url.getProtocol();
+      String protocolName = uri.getScheme();
       if (protocolName == null) {
         throw new ProtocolNotFoundException(uri);
       }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
Thu Nov 13 01:47:55 2008
@@ -24,6 +24,7 @@
 import java.util.HashSet;
 import java.util.Map;
 
+import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
 import org.apache.droids.api.Parser;
@@ -42,6 +43,7 @@
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXNotRecognizedException;
 import org.xml.sax.SAXNotSupportedException;
@@ -69,7 +71,7 @@
 
   private Link link = null;
 
-  public Parse getParse(InputStream stream, Link newLink) throws DroidsException, IOException {
+  public Parse getParse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
     this.link = newLink;
     this.base = newLink.getURI();
     ParseData parseData = null;
@@ -80,13 +82,16 @@
     final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
     // parse document
     // XMLInputSource source = new XMLInputSource(null, uri, uri);
+    InputStream instream = entity.obtainContent();
     try {
-      parser.parse(base.toString(), node);
+      parser.parse(new InputSource(instream), node);
     } catch (SAXException ex) {
       throw new ContentFormatViolationException("Failure parsing HTML content", ex);
+    } finally {
+      instream.close();
     }
     parseData = extract(node);
-    return new ParseImpl(stream.toString(), parseData);
+    return new ParseImpl(newLink.getId(), parseData);
   }
 
   private ParseData extract(DocumentFragment node) throws InvalidLinkException {

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java
Thu Nov 13 01:47:55 2008
@@ -5,35 +5,71 @@
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.MalformedURLException;
+import java.net.URI;
 
+import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Protocol;
 
 public class FileProtocol implements Protocol {
-  FileInputStream fileInputStream=null;
 
-  public String getContentType(String url) {
-    // FIXME: to be implemented
-    return null;
-  }
+  FileInputStream fileInputStream=null;
 
-  public boolean isAllowed(String url) throws MalformedURLException {
-    File file = new File(extractLocation(url));
+  public boolean isAllowed(URI uri) {
+    File file = new File(extractLocation(uri));
     return file.canRead();
   }
 
-  public InputStream openStream(String url) throws IOException {
-    url = extractLocation(url);
-    fileInputStream = new FileInputStream(url);
-    return new BufferedInputStream(fileInputStream);
+  public ContentEntity load(URI uri) throws IOException {
+    File file = new File(extractLocation(uri));
+    return new FileContentEntity(file);
   }
 
-  private String extractLocation(String url) {
-    final int start = url.indexOf("://");
+  private String extractLocation(URI uri) {
+    String location = uri.toString();
+    final int start = location.indexOf("://");
     if(start>-1){
-      url = url.substring(start+3);
+      location = location.substring(start+3);
     }
-    return url;
+    return location;
   }
 
+  static class FileContentEntity implements ContentEntity {
+    
+    private final File file;
+    private final String mimeType;
+    private final String charset;
+    
+    public FileContentEntity(File file) throws IOException {
+      super();
+      this.file = file;
+      String s = file.getName().toLowerCase();
+      if (s.endsWith(".html") || s.endsWith(".htm")) {
+        this.mimeType = "text/html";
+        this.charset = "ISO-8859-1";
+      } else if (s.endsWith(".txt")) {
+        this.mimeType = "text/plain";
+        this.charset = "ISO-8859-1";
+      } else {
+        this.mimeType = "binary/octet-stream";
+        this.charset = null;
+      }
+    }
+
+    public InputStream obtainContent() throws IOException {
+      return new BufferedInputStream(new FileInputStream(file));
+    }
+
+    public void finish() {
+    }
+
+    public String getMimeType() {
+      return mimeType;
+    }
+
+    public String getCharset() {
+      return charset;
+    }
+
+  }
+  
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java
Thu Nov 13 01:47:55 2008
@@ -26,7 +26,10 @@
 import org.apache.http.client.HttpRequestRetryHandler;
 import org.apache.http.client.RedirectHandler;
 import org.apache.http.client.UserTokenHandler;
+import org.apache.http.client.params.AuthPolicy;
+import org.apache.http.client.protocol.ClientContext;
 import org.apache.http.client.protocol.RequestDefaultHeaders;
+import org.apache.http.client.protocol.RequestProxyAuthentication;
 import org.apache.http.conn.ClientConnectionManager;
 import org.apache.http.conn.ConnectionKeepAliveStrategy;
 import org.apache.http.conn.routing.HttpRoutePlanner;
@@ -36,10 +39,13 @@
 import org.apache.http.conn.ssl.SSLSocketFactory;
 import org.apache.http.cookie.CookieSpecRegistry;
 import org.apache.http.impl.DefaultConnectionReuseStrategy;
+import org.apache.http.impl.auth.BasicSchemeFactory;
+import org.apache.http.impl.auth.DigestSchemeFactory;
 import org.apache.http.impl.client.AbstractHttpClient;
 import org.apache.http.impl.client.BasicCookieStore;
 import org.apache.http.impl.client.BasicCredentialsProvider;
 import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
+import org.apache.http.impl.client.DefaultProxyAuthenticationHandler;
 import org.apache.http.impl.client.DefaultRedirectHandler;
 import org.apache.http.impl.client.DefaultUserTokenHandler;
 import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
@@ -95,6 +101,8 @@
     httpproc.addInterceptor(new RequestConnControl());
     httpproc.addInterceptor(new RequestUserAgent());
     httpproc.addInterceptor(new RequestExpectContinue());
+    // HTTP authentication interceptors
+    httpproc.addInterceptor(new RequestProxyAuthentication());
     return httpproc;
   }
 
@@ -122,8 +130,14 @@
   @Override
   protected AuthSchemeRegistry createAuthSchemeRegistry()
   {
-    // Return empty auth scheme registry. There'll be no auth support
-    return new AuthSchemeRegistry();
+    AuthSchemeRegistry registry = new AuthSchemeRegistry(); 
+    registry.register(
+            AuthPolicy.BASIC, 
+            new BasicSchemeFactory());
+    registry.register(
+            AuthPolicy.DIGEST, 
+            new DigestSchemeFactory());
+    return registry;
   }
 
   @Override
@@ -149,6 +163,12 @@
   protected HttpContext createHttpContext()
   {
     HttpContext context = new BasicHttpContext();
+    context.setAttribute(
+            ClientContext.AUTHSCHEME_REGISTRY, 
+            getAuthSchemes());
+    context.setAttribute(
+            ClientContext.CREDS_PROVIDER, 
+            getCredentialsProvider());
     return context;
   }
 
@@ -167,7 +187,7 @@
   @Override
   protected AuthenticationHandler createProxyAuthenticationHandler()
   {
-    return new NoAuthHandler();
+    return new DefaultProxyAuthenticationHandler();
   }
 
   @Override

Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java?rev=713691&view=auto
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
(added)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,81 @@
+/*
+ * ====================================================================
+ *
+ *  Copyright 2005 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+import org.apache.droids.norobots.ContentLoader;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpHead;
+
+/**
+ * {@link ContentLoader} based on HttpClient 4.0.
+ */
+public class HttpClientContentLoader implements ContentLoader
+{
+
+  private final HttpClient httpclient;
+  
+  public HttpClientContentLoader(HttpClient httpclient) {
+    super();
+    this.httpclient = httpclient;
+  }
+  
+  public boolean exists(URI uri) throws IOException
+  {
+    HttpHead httphead = new HttpHead(uri);
+    HttpResponse response = httpclient.execute(httphead);
+    return response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
+  }
+
+  public InputStream load(URI uri) throws IOException {
+    HttpGet httpget = new HttpGet(uri);
+    HttpResponse response = httpclient.execute(httpget);
+    StatusLine statusline = response.getStatusLine();
+    if (statusline.getStatusCode() == HttpStatus.SC_NOT_FOUND) {
+      return null;
+    }
+    if (statusline.getStatusCode() != HttpStatus.SC_OK) {
+      throw new HttpResponseException(
+          statusline.getStatusCode(), statusline.getReasonPhrase());
+    }
+    HttpEntity entity = response.getEntity();
+    if (entity != null) {
+      return entity.getContent();
+    } else {
+      return null;
+    }
+  }
+
+}

Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java?rev=713691&view=auto
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
(added)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HttpEntity;
+import org.apache.http.NameValuePair;
+import org.apache.http.entity.BufferedHttpEntity;
+import org.apache.http.protocol.HTTP;
+
+public class HttpContentEntity implements ContentEntity {
+
+  private final HttpEntity entity;
+  private final String mimeType;
+  private final String charset;
+  
+  public HttpContentEntity(HttpEntity entity) throws IOException {
+    super();
+    if (entity.isRepeatable()) {
+      this.entity = entity;
+    } else {
+      this.entity = new BufferedHttpEntity(entity);
+    }
+    
+    String mimeType = null;
+    String charset = null;
+    Header header = entity.getContentType();
+    if (header != null) {
+      HeaderElement[] helems = header.getElements();
+      if (helems != null && helems.length > 0) {
+        HeaderElement helem = helems[0];
+        mimeType = helem.getName();
+        NameValuePair nvp = helem.getParameterByName("charset");
+        if (nvp != null) {
+          charset = nvp.getValue();
+        }
+      }
+    }
+    if (mimeType != null) {
+      this.mimeType = mimeType.toLowerCase(Locale.ENGLISH);
+    } else {
+      this.mimeType = "binary/octet-stream";
+    }
+    if (charset != null) {
+      this.charset = charset;
+    } else {
+      if (this.mimeType.startsWith("text/")) {
+        this.charset = HTTP.ISO_8859_1;
+      } else {
+        this.charset = null;
+      }
+    }
+  }
+  
+  public String getMimeType() {
+    return mimeType;
+  }
+
+  public String getCharset() {
+    return charset;
+  }
+
+  public InputStream obtainContent() throws IOException {
+    return entity.getContent();
+  }
+
+  public void finish() {
+  }
+
+}

Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=713691&view=auto
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
(added)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
Thu Nov 13 01:47:55 2008
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.protocol.http;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Protocol;
+import org.apache.droids.helper.Loggable;
+import org.apache.droids.norobots.ContentLoader;
+import org.apache.droids.norobots.NoRobotClient;
+import org.apache.droids.norobots.NoRobotException;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.params.CoreProtocolPNames;
+
+/**
+ * Protocol handler based on HttpClient 4.0.
+ */
+public class HttpProtocol extends Loggable implements Protocol {
+  
+  private final HttpClient httpclient;
+  private final ContentLoader contentLoader;
+  
+  private boolean forceAllow = false;
+  private String userAgent = "Apache-Droids/1.1 (java 1.5)";
+
+  public HttpProtocol(final HttpClient httpclient) {
+    super();
+    this.httpclient = httpclient;
+    this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent);
+    this.contentLoader = new HttpClientContentLoader(httpclient);
+  }
+  
+  public HttpProtocol() {
+    this(new DefaultHttpClient());
+  }
+  
+  public ContentEntity load(URI uri) throws IOException {
+    HttpGet httpget = new HttpGet(uri);
+    HttpResponse response = httpclient.execute(httpget);
+    StatusLine statusline = response.getStatusLine();
+    if (statusline.getStatusCode() != HttpStatus.SC_OK) {
+      throw new HttpResponseException(
+          statusline.getStatusCode(), statusline.getReasonPhrase());
+    }
+    HttpEntity entity = response.getEntity();
+    if (entity == null) {
+      // Should _almost_ never happen with HTTP GET requests.
+      throw new ClientProtocolException("Empty entity");
+    }
+    return new HttpContentEntity(entity);
+  }
+
+  public boolean isAllowed(URI uri) {
+    if (forceAllow) {
+      return forceAllow;
+    }
+
+    String path = uri.getPath();
+    int i = path.lastIndexOf('/');
+    if (i != -1) {
+      path = path.substring(0, i);
+    } else {
+      path = path + "/";
+    }
+    
+    URI baseURI;
+    try {
+      baseURI = new URI(
+          uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), 
+          path, null, null);
+    } catch (URISyntaxException ex) {
+      log.error("Unable to determine base URI for " + uri);
+      return false;
+    }
+    
+    NoRobotClient nrc = new NoRobotClient(contentLoader, userAgent);
+    try {
+      nrc.parse(baseURI);
+    } catch (NoRobotException ex) {
+      log.error("Failure parsing robots.txt: " + ex.getMessage());
+      return false;
+    } catch (IOException ex) {
+      log.error("I/O error parsing robots.txt: " + ex.getMessage());
+      return false;
+    }
+    boolean test = nrc.isUrlAllowed(uri);
+    String message = (test) ? "allowed" : "denied";
+    if (log.isInfoEnabled()) {
+      log.info("Url is " + message);
+    }
+    return test;
+  }
+
+  public String getUserAgent() {
+    return userAgent;
+  }
+
+  public void setUserAgent(String userAgent) {
+    this.userAgent = userAgent;
+    this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent);
+  }
+
+  /**
+   * You can force that a site is allowed (ignoring the robots.txt). This should
+   * only be used on server that you control and where you have the permission
+   * to ignore the robots.txt.
+   * 
+   * @return <code>true</code> if you are rude and ignore robots.txt.
+   *         <code>false</code> if you are playing nice.
+   */
+  public boolean isForceAllow() {
+    return forceAllow;
+  }
+
+  /**
+   * You can force that a site is allowed (ignoring the robot.txt). This should
+   * only be used on server that you control and where you have the permission
+   * to ignore the robots.txt.
+   * 
+   * @param forceAllow
+   *                if you want to force an allow and ignore the robot.txt set
+   *                to <code>true</code>. If you want to obey the rules and
+   *                be polite set to <code>false</code>.
+   */
+  public void setForceAllow(boolean forceAllow) {
+    this.forceAllow = forceAllow;
+  }
+
+}

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
Thu Nov 13 01:47:55 2008
@@ -17,11 +17,12 @@
 package org.apache.droids.robot.crawler;
 
 import java.io.IOException;
-import java.io.InputStream;
+import java.net.URI;
 import java.util.Collection;
 import java.util.LinkedHashMap;
 import java.util.Map;
 
+import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
 import org.apache.droids.api.Parser;
@@ -42,21 +43,20 @@
 
   public void execute(Link link) throws DroidsException, IOException
   {
-    InputStream openStream = null;
     final String userAgent = this.getClass().getCanonicalName();
     if (log.isInfoEnabled()) {
       log.info("Starting " + userAgent);
     }
-    try {
-      String url = link.getId();
-      final Protocol protocol = droid.getProtocolFactory().getProtocol(url);
-      openStream = protocol.openStream(url);
-      if (protocol.isAllowed(url)) {
-        String contentType = protocol.getContentType(url);
+    URI uri = link.getURI();
+    final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
+    if (protocol.isAllowed(uri)) {
+      ContentEntity entity = protocol.load(uri);
+      try {
+        String contentType = entity.getMimeType();
         if (log.isInfoEnabled()) {
           log.info("contentType " + contentType);
         }
-        if (contentType==null){
+        if (contentType == null){
           log.info("missing contentType... can't parse...");
         }
         else {
@@ -67,35 +67,28 @@
             }
           }
           else {
-            Parse parse = parser.getParse(openStream, link);
+            Parse parse = parser.getParse(entity, link);
             if( parse.getData() != null ) {
               Collection<Link> outlinks = getFilteredOutlinks( parse );
               droid.getQueue().merge( outlinks );
             }
-            handle( parse, openStream, link );
+            handle( parse, entity, link );
           }
         }
-      } 
-      else {
-        log.info("Stopping processing since"
-            + " bots are not allowed for this url.");
+      } finally {
+        entity.finish();
       }
     } 
-    finally{
-      try {
-        if (openStream != null) {
-          openStream.close();
-        }
-      } catch (IOException ex) {
-        log.error("Error closing stream", ex);
-      }
+    else {
+      log.info("Stopping processing since"
+          + " bots are not allowed for this url.");
     }
   }
   
-  protected void handle( Parse parse, InputStream openStream, Link link ) 
+  protected void handle( Parse parse, ContentEntity entity, Link link ) 
       throws DroidsException, IOException
   {
-    droid.getHandlerFactory().handle(openStream, link.getURI(), parse);
+    droid.getHandlerFactory().handle(entity, link.getURI(), parse);
   }
   
   protected Collection<Link> getFilteredOutlinks( Parse parse )

Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
(original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
Thu Nov 13 01:47:55 2008
@@ -16,7 +16,7 @@
 import org.apache.droids.impl.SequentialTaskMaster;
 import org.apache.droids.impl.SimpleTaskQueue;
 import org.apache.droids.parse.html.HtmlParser;
-import org.apache.droids.protocol.http.Http;
+import org.apache.droids.protocol.http.HttpProtocol;
 import org.apache.droids.robot.crawler.CrawlingDroid;
 
 public class DroidsFactory
@@ -37,9 +37,8 @@
 
   public static ProtocolFactory createDefaultProtocolFactory() {
     ProtocolFactory protocolFactory = new ProtocolFactory();
-    Http httpProtocol = new Http();
+    HttpProtocol httpProtocol = new HttpProtocol();
     httpProtocol.setForceAllow(true);
-    httpProtocol.setUserAgent("Droids/1.1");
     
     protocolFactory.setMap(new HashMap<String, Object>());
     protocolFactory.getMap().put("http", httpProtocol);

Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
(original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java
Thu Nov 13 01:47:55 2008
@@ -16,7 +16,7 @@
  */
 package org.apache.droids.examples;
 
-import java.io.InputStream;
+import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Handler;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
@@ -34,20 +34,8 @@
 
 
   @Override
-  protected void handle( Parse parse, InputStream openStream, Link link )
+  protected void handle( Parse parse, ContentEntity entity, Link link )
   {
     // TODO -- something different...
-//    Handler handler = getHandlerFactory().resolve("solr");
-//    try {
-//      handler.handle(getProtocol().openStream(getUri()), new URL(getUri()), parse);
-//    } catch (Exception e) {
-//      SimpleThreads.threadMessage(e.getMessage());
-//    }
-//    try {
-//      getDroid().finishedWorker(super.getId());
-//    } catch (DroidsException e) {
-//      // TODO Auto-generated catch block
-//      e.printStackTrace();
-//    }
   }
 }

Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
(original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java
Thu Nov 13 01:47:55 2008
@@ -34,7 +34,7 @@
 import org.apache.droids.impl.SimpleTaskQueue;
 import org.apache.droids.net.RegexURLFilter;
 import org.apache.droids.parse.html.HtmlParser;
-import org.apache.droids.protocol.http.Http;
+import org.apache.droids.protocol.http.HttpProtocol;
 
 /**
  * Simple Droids runtime that wires various components together in Java code 
@@ -67,8 +67,7 @@
 
     // Create protocol factory. Support HTTP only.
     ProtocolFactory protocolFactory = new ProtocolFactory();
-    Http httpProtocol = new Http();
-    httpProtocol.setForceAllow(true);
+    HttpProtocol httpProtocol = new HttpProtocol();
     protocolFactory.setMap(new HashMap<String, Object>());
     protocolFactory.getMap().put("http", httpProtocol);
     

Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java?rev=713691&r1=713690&r2=713691&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java
(original)
+++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java
Thu Nov 13 01:47:55 2008
@@ -59,8 +59,10 @@
       InputStreamEntity entity = new InputStreamEntity(instream, -1);
       if (requestURI.endsWith("_html")) {
         entity.setContentType("text/html");
+        entity.setChunked(true);
       }
       response.setEntity(entity);
+      
     } else {
       response.setStatusCode(HttpStatus.SC_NOT_FOUND);
       StringEntity entity = new StringEntity(requestURI + " not found", "US-ASCII");



Mime
View raw message