From droids-commits-return-78-apmail-incubator-droids-commits-archive=incubator.apache.org@incubator.apache.org Thu Nov 13 09:48:58 2008 Return-Path: Delivered-To: apmail-incubator-droids-commits-archive@locus.apache.org Received: (qmail 60473 invoked from network); 13 Nov 2008 09:48:57 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 13 Nov 2008 09:48:57 -0000 Received: (qmail 28232 invoked by uid 500); 13 Nov 2008 09:49:05 -0000 Delivered-To: apmail-incubator-droids-commits-archive@incubator.apache.org Received: (qmail 28209 invoked by uid 500); 13 Nov 2008 09:49:05 -0000 Mailing-List: contact droids-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: droids-dev@incubator.apache.org Delivered-To: mailing list droids-commits@incubator.apache.org Received: (qmail 28198 invoked by uid 99); 13 Nov 2008 09:49:04 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 13 Nov 2008 01:49:04 -0800 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 13 Nov 2008 09:47:44 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id D4941238895F; Thu, 13 Nov 2008 01:47:57 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r713691 - in /incubator/droids/trunk/droids-core: ./ src/main/java/org/apache/droids/api/ src/main/java/org/apache/droids/exception/ src/main/java/org/apache/droids/helper/factories/ src/main/java/org/apache/droids/net/ src/main/java/org/ap... 
Date: Thu, 13 Nov 2008 09:47:56 -0000 To: droids-commits@incubator.apache.org From: olegk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20081113094757.D4941238895F@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: olegk Date: Thu Nov 13 01:47:55 2008 New Revision: 713691 URL: http://svn.apache.org/viewvc?rev=713691&view=rev Log: API changes: * Added ContentEntity interface representing a body of content retrieved from a URI * Replaced default HTTP protocol implementation based on URLConnection with one based on HttpClient 4.0 Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java Removed: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/net/UrlHelper.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/HttpBase.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/MediaType.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java Modified: incubator/droids/trunk/droids-core/pom.xml incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java Modified: incubator/droids/trunk/droids-core/pom.xml URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/pom.xml?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/pom.xml (original) +++ incubator/droids/trunk/droids-core/pom.xml Thu Nov 13 01:47:55 2008 @@ -52,10 +52,6 @@ commons-logging - commons-io - commons-io - - org.apache.geronimo.specs geronimo-stax-api_1.0_spec Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java?rev=713691&view=auto ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java (added) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/ContentEntity.java Thu Nov 13 01:47:55 2008 @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.droids.api; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Abstract interface representing a body of content with a particular + * MIME type and an optional charset. + *

+ * IMPORTANT: implementations of this interface MUST ensure that the content + * is repeatable, that is, the content can be consumed more than once. + *

+ * IMPORTANT: The consumer of the entity content MUST close the input stream + * returned by {@link #obtainContent()} when finished reading the content. + * The consumer MUST call {@link #finish()} when the entity is no longer + * needed in order to release underlying resources held by the entity. + * + * @version 1.0 + */ +public interface ContentEntity { + + /** + * Returns content of the entity as an input stream. This input stream + * MUST be closed by the consumer when finished reading content. + *

+ * IMPORTANT: This method MUST return a new instance of {@link InputStream} + * to ensure the content can be consumed more than once. + * + * @return input stream + * @throws IOException + */ + InputStream obtainContent() throws IOException; + + /** + * Returns MIME type of the entity. + * + * @return MIME type + */ + String getMimeType(); + + /** + * Returns charset of the entity if known. Otherwise returns + * null. + * + * @return charset + */ + String getCharset(); + + /** + * Release all underlying resources held by the entity. + */ + void finish(); + +} \ No newline at end of file Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Thu Nov 13 01:47:55 2008 @@ -17,7 +17,6 @@ package org.apache.droids.api; import java.io.IOException; -import java.io.InputStream; import org.apache.droids.exception.DroidsException; @@ -38,5 +37,5 @@ * the link that correspond to the stream * @return the parse object */ - Parse getParse(InputStream openStream, Link link) throws DroidsException, IOException; + Parse getParse(ContentEntity entity, Link link) throws DroidsException, IOException; } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java (original) +++ 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Protocol.java Thu Nov 13 01:47:55 2008 @@ -17,8 +17,8 @@ package org.apache.droids.api; import java.io.IOException; -import java.io.InputStream; import java.net.MalformedURLException; +import java.net.URI; /** * The protocol interface is a wrapper to hide the underlying implementation of @@ -37,24 +37,16 @@ * @return true if we can request the url. false if we are forbidden. * @throws MalformedURLException */ - boolean isAllowed(String url) throws MalformedURLException; + boolean isAllowed(URI url) throws IOException; /** - * Return the stream represent of the url + * Return the content entity represent of the url * * @param url * url of the stream we want to open - * @return the stream of the given url + * @return the content of the given url * @throws IOException */ - InputStream openStream(String url) throws IOException; + ContentEntity load(URI uri) throws IOException; - /** - * Returns the content type of the url - * - * @param url - * url to evaluate - * @return registered content type - */ - String getContentType(String url); } \ No newline at end of file Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/exception/ProtocolNotFoundException.java Thu Nov 13 01:47:55 2008 @@ -16,6 +16,8 @@ */ package org.apache.droids.exception; +import java.net.URI; + /** * If we do not have any instance of a protocol registered for the iven url. 
* @@ -24,7 +26,7 @@ */ public class ProtocolNotFoundException extends DroidsException { private static final long serialVersionUID = 6980937469875896426L; - private String url = null; + private URI uri = null; /** * Create an exception for the given url @@ -32,8 +34,8 @@ * @param url * url where we do not have a suitable protocol */ - public ProtocolNotFoundException(String url) { - this(url, "protocol not found for url=" + url); + public ProtocolNotFoundException(URI uri) { + this(uri, "protocol not found for uri=" + uri); } /** @@ -44,9 +46,9 @@ * @param message * detailed message to explain the underlying cause */ - public ProtocolNotFoundException(String url, String message) { + public ProtocolNotFoundException(URI uri, String message) { super(message); - this.url = url; + this.uri = uri; } /** @@ -54,7 +56,7 @@ * * @return url which has caused the problem */ - public String getUrl() { - return url; + public URI getUri() { + return uri; } } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/HandlerFactory.java Thu Nov 13 01:47:55 2008 @@ -16,13 +16,11 @@ */ package org.apache.droids.helper.factories; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; -import org.apache.commons.io.IOUtils; +import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Handler; import org.apache.droids.api.Parse; import 
org.apache.droids.exception.DroidsException; @@ -47,21 +45,15 @@ * the underlying parse object * @return false if we found a problem, true if all went well */ - public boolean handle(InputStream stream, URI uri, Parse parse) + public boolean handle(ContentEntity entity, URI uri, Parse parse) throws DroidsException, IOException { - byte[] streamCopy = null; - if(stream==null){ - return false; - } - ByteArrayOutputStream out = new ByteArrayOutputStream(); - IOUtils.copy(stream, out); - streamCopy = out.toByteArray(); - for (Handler handler : getMap().values()) { - if (streamCopy == null) { - return false; + InputStream instream = entity.obtainContent(); + try { + handler.handle(instream, uri, parse); + } finally { + instream.close(); } - handler.handle(new ByteArrayInputStream(streamCopy), uri, parse); } return true; } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/helper/factories/ProtocolFactory.java Thu Nov 13 01:47:55 2008 @@ -16,8 +16,7 @@ */ package org.apache.droids.helper.factories; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; import org.apache.droids.api.Protocol; import org.apache.droids.exception.ProtocolNotFoundException; @@ -38,13 +37,10 @@ * @return ready to use protocol plugin or null if non have been found * @throws ProtocolNotFoundException */ - public Protocol getProtocol(String uri) - throws MalformedURLException, ProtocolNotFoundException { - URL url = null; + public Protocol getProtocol(URI uri) 
throws ProtocolNotFoundException { Protocol protocol = null; try { - url = new URL(uri); - String protocolName = url.getProtocol(); + String protocolName = uri.getScheme(); if (protocolName == null) { throw new ProtocolNotFoundException(uri); } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Thu Nov 13 01:47:55 2008 @@ -24,6 +24,7 @@ import java.util.HashSet; import java.util.Map; +import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Link; import org.apache.droids.api.Parse; import org.apache.droids.api.Parser; @@ -42,6 +43,7 @@ import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; @@ -69,7 +71,7 @@ private Link link = null; - public Parse getParse(InputStream stream, Link newLink) throws DroidsException, IOException { + public Parse getParse(ContentEntity entity, Link newLink) throws DroidsException, IOException { this.link = newLink; this.base = newLink.getURI(); ParseData parseData = null; @@ -80,13 +82,16 @@ final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); // parse document // XMLInputSource source = new XMLInputSource(null, uri, uri); + InputStream instream = entity.obtainContent(); try { - parser.parse(base.toString(), node); + parser.parse(new InputSource(instream), node); } catch (SAXException ex) { throw new 
ContentFormatViolationException("Failure parsing HTML content", ex); + } finally { + instream.close(); } parseData = extract(node); - return new ParseImpl(stream.toString(), parseData); + return new ParseImpl(newLink.getId(), parseData); } private ParseData extract(DocumentFragment node) throws InvalidLinkException { Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/file/FileProtocol.java Thu Nov 13 01:47:55 2008 @@ -5,35 +5,71 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.net.MalformedURLException; +import java.net.URI; +import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Protocol; public class FileProtocol implements Protocol { - FileInputStream fileInputStream=null; - public String getContentType(String url) { - // FIXME: to be implemented - return null; - } + FileInputStream fileInputStream=null; - public boolean isAllowed(String url) throws MalformedURLException { - File file = new File(extractLocation(url)); + public boolean isAllowed(URI uri) { + File file = new File(extractLocation(uri)); return file.canRead(); } - public InputStream openStream(String url) throws IOException { - url = extractLocation(url); - fileInputStream = new FileInputStream(url); - return new BufferedInputStream(fileInputStream); + public ContentEntity load(URI uri) throws IOException { + File file = new File(extractLocation(uri)); + return new FileContentEntity(file); } - private String extractLocation(String 
url) { - final int start = url.indexOf("://"); + private String extractLocation(URI uri) { + String location = uri.toString(); + final int start = location.indexOf("://"); if(start>-1){ - url = url.substring(start+3); + location = location.substring(start+3); } - return url; + return location; } + static class FileContentEntity implements ContentEntity { + + private final File file; + private final String mimeType; + private final String charset; + + public FileContentEntity(File file) throws IOException { + super(); + this.file = file; + String s = file.getName().toLowerCase(); + if (s.endsWith(".html") || s.endsWith(".htm")) { + this.mimeType = "text/html"; + this.charset = "ISO-8859-1"; + } else if (s.endsWith(".txt")) { + this.mimeType = "text/plain"; + this.charset = "ISO-8859-1"; + } else { + this.mimeType = "binary/octet-stream"; + this.charset = null; + } + } + + public InputStream obtainContent() throws IOException { + return new BufferedInputStream(new FileInputStream(file)); + } + + public void finish() { + } + + public String getMimeType() { + return mimeType; + } + + public String getCharset() { + return charset; + } + + } + } Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsHttpClient.java Thu Nov 13 01:47:55 2008 @@ -26,7 +26,10 @@ import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.client.RedirectHandler; import org.apache.http.client.UserTokenHandler; +import org.apache.http.client.params.AuthPolicy; 
+import org.apache.http.client.protocol.ClientContext; import org.apache.http.client.protocol.RequestDefaultHeaders; +import org.apache.http.client.protocol.RequestProxyAuthentication; import org.apache.http.conn.ClientConnectionManager; import org.apache.http.conn.ConnectionKeepAliveStrategy; import org.apache.http.conn.routing.HttpRoutePlanner; @@ -36,10 +39,13 @@ import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.cookie.CookieSpecRegistry; import org.apache.http.impl.DefaultConnectionReuseStrategy; +import org.apache.http.impl.auth.BasicSchemeFactory; +import org.apache.http.impl.auth.DigestSchemeFactory; import org.apache.http.impl.client.AbstractHttpClient; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy; +import org.apache.http.impl.client.DefaultProxyAuthenticationHandler; import org.apache.http.impl.client.DefaultRedirectHandler; import org.apache.http.impl.client.DefaultUserTokenHandler; import org.apache.http.impl.conn.ProxySelectorRoutePlanner; @@ -95,6 +101,8 @@ httpproc.addInterceptor(new RequestConnControl()); httpproc.addInterceptor(new RequestUserAgent()); httpproc.addInterceptor(new RequestExpectContinue()); + // HTTP authentication interceptors + httpproc.addInterceptor(new RequestProxyAuthentication()); return httpproc; } @@ -122,8 +130,14 @@ @Override protected AuthSchemeRegistry createAuthSchemeRegistry() { - // Return empty auth scheme registry. 
There'll be no auth support - return new AuthSchemeRegistry(); + AuthSchemeRegistry registry = new AuthSchemeRegistry(); + registry.register( + AuthPolicy.BASIC, + new BasicSchemeFactory()); + registry.register( + AuthPolicy.DIGEST, + new DigestSchemeFactory()); + return registry; } @Override @@ -149,6 +163,12 @@ protected HttpContext createHttpContext() { HttpContext context = new BasicHttpContext(); + context.setAttribute( + ClientContext.AUTHSCHEME_REGISTRY, + getAuthSchemes()); + context.setAttribute( + ClientContext.CREDS_PROVIDER, + getCredentialsProvider()); return context; } @@ -167,7 +187,7 @@ @Override protected AuthenticationHandler createProxyAuthenticationHandler() { - return new NoAuthHandler(); + return new DefaultProxyAuthenticationHandler(); } @Override Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java?rev=713691&view=auto ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java (added) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpClientContentLoader.java Thu Nov 13 01:47:55 2008 @@ -0,0 +1,81 @@ +/* + * ==================================================================== + * + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + * + */ + +package org.apache.droids.protocol.http; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + +import org.apache.droids.norobots.ContentLoader; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; +import org.apache.http.StatusLine; +import org.apache.http.client.HttpClient; +import org.apache.http.client.HttpResponseException; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpHead; + +/** + * {@link ContentLoader} based on HttpClient 4.0. 
+ */ +public class HttpClientContentLoader implements ContentLoader +{ + + private final HttpClient httpclient; + + public HttpClientContentLoader(HttpClient httpclient) { + super(); + this.httpclient = httpclient; + } + + public boolean exists(URI uri) throws IOException + { + HttpHead httphead = new HttpHead(uri); + HttpResponse response = httpclient.execute(httphead); + return response.getStatusLine().getStatusCode() == HttpStatus.SC_OK; + } + + public InputStream load(URI uri) throws IOException { + HttpGet httpget = new HttpGet(uri); + HttpResponse response = httpclient.execute(httpget); + StatusLine statusline = response.getStatusLine(); + if (statusline.getStatusCode() == HttpStatus.SC_NOT_FOUND) { + return null; + } + if (statusline.getStatusCode() != HttpStatus.SC_OK) { + throw new HttpResponseException( + statusline.getStatusCode(), statusline.getReasonPhrase()); + } + HttpEntity entity = response.getEntity(); + if (entity != null) { + return entity.getContent(); + } else { + return null; + } + } + +} Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java?rev=713691&view=auto ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java (added) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpContentEntity.java Thu Nov 13 01:47:55 2008 @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.droids.protocol.http; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Locale; + +import org.apache.droids.api.ContentEntity; +import org.apache.http.Header; +import org.apache.http.HeaderElement; +import org.apache.http.HttpEntity; +import org.apache.http.NameValuePair; +import org.apache.http.entity.BufferedHttpEntity; +import org.apache.http.protocol.HTTP; + +public class HttpContentEntity implements ContentEntity { + + private final HttpEntity entity; + private final String mimeType; + private final String charset; + + public HttpContentEntity(HttpEntity entity) throws IOException { + super(); + if (entity.isRepeatable()) { + this.entity = entity; + } else { + this.entity = new BufferedHttpEntity(entity); + } + + String mimeType = null; + String charset = null; + Header header = entity.getContentType(); + if (header != null) { + HeaderElement[] helems = header.getElements(); + if (helems != null && helems.length > 0) { + HeaderElement helem = helems[0]; + mimeType = helem.getName(); + NameValuePair nvp = helem.getParameterByName("charset"); + if (nvp != null) { + charset = nvp.getValue(); + } + } + } + if (mimeType != null) { + this.mimeType = mimeType.toLowerCase(Locale.ENGLISH); + } else { + this.mimeType = "binary/octet-stream"; + } + if (charset != null) { + this.charset = charset; + } else { + if 
(this.mimeType.startsWith("text/")) { + this.charset = HTTP.ISO_8859_1; + } else { + this.charset = null; + } + } + } + + public String getMimeType() { + return mimeType; + } + + public String getCharset() { + return charset; + } + + public InputStream obtainContent() throws IOException { + return entity.getContent(); + } + + public void finish() { + } + +} Added: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=713691&view=auto ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java (added) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java Thu Nov 13 01:47:55 2008 @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.droids.protocol.http; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.droids.api.ContentEntity; +import org.apache.droids.api.Protocol; +import org.apache.droids.helper.Loggable; +import org.apache.droids.norobots.ContentLoader; +import org.apache.droids.norobots.NoRobotClient; +import org.apache.droids.norobots.NoRobotException; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; +import org.apache.http.StatusLine; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.HttpClient; +import org.apache.http.client.HttpResponseException; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.params.CoreProtocolPNames; + +/** + * Protocol handler based on HttpClient 4.0. + */ +public class HttpProtocol extends Loggable implements Protocol { + + private final HttpClient httpclient; + private final ContentLoader contentLoader; + + private boolean forceAllow = false; + private String userAgent = "Apache-Droids/1.1 (java 1.5)"; + + public HttpProtocol(final HttpClient httpclient) { + super(); + this.httpclient = httpclient; + this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent); + this.contentLoader = new HttpClientContentLoader(httpclient); + } + + public HttpProtocol() { + this(new DefaultHttpClient()); + } + + public ContentEntity load(URI uri) throws IOException { + HttpGet httpget = new HttpGet(uri); + HttpResponse response = httpclient.execute(httpget); + StatusLine statusline = response.getStatusLine(); + if (statusline.getStatusCode() != HttpStatus.SC_OK) { + throw new HttpResponseException( + statusline.getStatusCode(), statusline.getReasonPhrase()); + } + HttpEntity entity = response.getEntity(); + if (entity == null) { + // Should _almost_ never happen with HTTP GET requests. 
+ throw new ClientProtocolException("Empty entity"); + } + return new HttpContentEntity(entity); + } + + public boolean isAllowed(URI uri) { + if (forceAllow) { + return forceAllow; + } + + String path = uri.getPath(); + int i = path.lastIndexOf('/'); + if (i != -1) { + path = path.substring(0, i); + } else { + path = path + "/"; + } + + URI baseURI; + try { + baseURI = new URI( + uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), + path, null, null); + } catch (URISyntaxException ex) { + log.error("Unable to determine base URI for " + uri); + return false; + } + + NoRobotClient nrc = new NoRobotClient(contentLoader, userAgent); + try { + nrc.parse(baseURI); + } catch (NoRobotException ex) { + log.error("Failure parsing robots.txt: " + ex.getMessage()); + return false; + } catch (IOException ex) { + log.error("I/O error parsing robots.txt: " + ex.getMessage()); + return false; + } + boolean test = nrc.isUrlAllowed(uri); + String message = (test) ? "allowed" : "denied"; + if (log.isInfoEnabled()) { + log.info("Url is " + message); + } + return test; + } + + public String getUserAgent() { + return userAgent; + } + + public void setUserAgent(String userAgent) { + this.userAgent = userAgent; + this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent); + } + + /** + * You can force that a site is allowed (ignoring the robots.txt). This should + * only be used on server that you control and where you have the permission + * to ignore the robots.txt. + * + * @return true if you are rude and ignore robots.txt. + * false if you are playing nice. + */ + public boolean isForceAllow() { + return forceAllow; + } + + /** + * You can force that a site is allowed (ignoring the robot.txt). This should + * only be used on server that you control and where you have the permission + * to ignore the robots.txt. + * + * @param forceAllow + * if you want to force an allow and ignore the robot.txt set + * to true. 
If you want to obey the rules and + * be polite set to false. + */ + public void setForceAllow(boolean forceAllow) { + this.forceAllow = forceAllow; + } + +} Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original) +++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java Thu Nov 13 01:47:55 2008 @@ -17,11 +17,12 @@ package org.apache.droids.robot.crawler; import java.io.IOException; -import java.io.InputStream; +import java.net.URI; import java.util.Collection; import java.util.LinkedHashMap; import java.util.Map; +import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Link; import org.apache.droids.api.Parse; import org.apache.droids.api.Parser; @@ -42,21 +43,20 @@ public void execute(Link link) throws DroidsException, IOException { - InputStream openStream = null; final String userAgent = this.getClass().getCanonicalName(); if (log.isInfoEnabled()) { log.info("Starting " + userAgent); } - try { - String url = link.getId(); - final Protocol protocol = droid.getProtocolFactory().getProtocol(url); - openStream = protocol.openStream(url); - if (protocol.isAllowed(url)) { - String contentType = protocol.getContentType(url); + URI uri = link.getURI(); + final Protocol protocol = droid.getProtocolFactory().getProtocol(uri); + if (protocol.isAllowed(uri)) { + ContentEntity entity = protocol.load(uri); + try { + String contentType = entity.getMimeType(); if (log.isInfoEnabled()) { log.info("contentType " + contentType); } - if (contentType==null){ + if (contentType == null){ 
log.info("missing contentType... can't parse..."); } else { @@ -67,35 +67,28 @@ } } else { - Parse parse = parser.getParse(openStream, link); + Parse parse = parser.getParse(entity, link); if( parse.getData() != null ) { Collection outlinks = getFilteredOutlinks( parse ); droid.getQueue().merge( outlinks ); } - handle( parse, openStream, link ); + handle( parse, entity, link ); } } - } - else { - log.info("Stopping processing since" - + " bots are not allowed for this url."); + } finally { + entity.finish(); } } - finally{ - try { - if (openStream != null) { - openStream.close(); - } - } catch (IOException ex) { - log.error("Error closing stream", ex); - } + else { + log.info("Stopping processing since" + + " bots are not allowed for this url."); } } - protected void handle( Parse parse, InputStream openStream, Link link ) + protected void handle( Parse parse, ContentEntity entity, Link link ) throws DroidsException, IOException { - droid.getHandlerFactory().handle(openStream, link.getURI(), parse); + droid.getHandlerFactory().handle(entity, link.getURI(), parse); } protected Collection getFilteredOutlinks( Parse parse ) Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java (original) +++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java Thu Nov 13 01:47:55 2008 @@ -16,7 +16,7 @@ import org.apache.droids.impl.SequentialTaskMaster; import org.apache.droids.impl.SimpleTaskQueue; import org.apache.droids.parse.html.HtmlParser; -import org.apache.droids.protocol.http.Http; +import org.apache.droids.protocol.http.HttpProtocol; import 
org.apache.droids.robot.crawler.CrawlingDroid; public class DroidsFactory @@ -37,9 +37,8 @@ public static ProtocolFactory createDefaultProtocolFactory() { ProtocolFactory protocolFactory = new ProtocolFactory(); - Http httpProtocol = new Http(); + HttpProtocol httpProtocol = new HttpProtocol(); httpProtocol.setForceAllow(true); - httpProtocol.setUserAgent("Droids/1.1"); protocolFactory.setMap(new HashMap()); protocolFactory.getMap().put("http", httpProtocol); Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java (original) +++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/IndexerWorker.java Thu Nov 13 01:47:55 2008 @@ -16,7 +16,7 @@ */ package org.apache.droids.examples; -import java.io.InputStream; +import org.apache.droids.api.ContentEntity; import org.apache.droids.api.Handler; import org.apache.droids.api.Link; import org.apache.droids.api.Parse; @@ -34,20 +34,8 @@ @Override - protected void handle( Parse parse, InputStream openStream, Link link ) + protected void handle( Parse parse, ContentEntity entity, Link link ) { // TODO -- something different... 
-// Handler handler = getHandlerFactory().resolve("solr"); -// try { -// handler.handle(getProtocol().openStream(getUri()), new URL(getUri()), parse); -// } catch (Exception e) { -// SimpleThreads.threadMessage(e.getMessage()); -// } -// try { -// getDroid().finishedWorker(super.getId()); -// } catch (DroidsException e) { -// // TODO Auto-generated catch block -// e.printStackTrace(); -// } } } Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java (original) +++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/examples/SimpleRuntime.java Thu Nov 13 01:47:55 2008 @@ -34,7 +34,7 @@ import org.apache.droids.impl.SimpleTaskQueue; import org.apache.droids.net.RegexURLFilter; import org.apache.droids.parse.html.HtmlParser; -import org.apache.droids.protocol.http.Http; +import org.apache.droids.protocol.http.HttpProtocol; /** * Simple Droids runtime that wires various components together in Java code @@ -67,8 +67,7 @@ // Create protocol factory. Support HTTP only. 
ProtocolFactory protocolFactory = new ProtocolFactory(); - Http httpProtocol = new Http(); - httpProtocol.setForceAllow(true); + HttpProtocol httpProtocol = new HttpProtocol(); protocolFactory.setMap(new HashMap()); protocolFactory.getMap().put("http", httpProtocol); Modified: incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java?rev=713691&r1=713690&r2=713691&view=diff ============================================================================== --- incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java (original) +++ incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/localserver/ResourceHandler.java Thu Nov 13 01:47:55 2008 @@ -59,8 +59,10 @@ InputStreamEntity entity = new InputStreamEntity(instream, -1); if (requestURI.endsWith("_html")) { entity.setContentType("text/html"); + entity.setChunked(true); } response.setEntity(entity); + } else { response.setStatusCode(HttpStatus.SC_NOT_FOUND); StringEntity entity = new StringEntity(requestURI + " not found", "US-ASCII");