Return-Path: Delivered-To: apmail-jakarta-lucene-dev-archive@apache.org Received: (qmail 92528 invoked from network); 17 Jun 2002 13:59:35 -0000 Received: from unknown (HELO nagoya.betaversion.org) (192.18.49.131) by daedalus.apache.org with SMTP; 17 Jun 2002 13:59:35 -0000 Received: (qmail 18710 invoked by uid 97); 17 Jun 2002 13:59:39 -0000 Delivered-To: qmlist-jakarta-archive-lucene-dev@jakarta.apache.org Received: (qmail 18689 invoked by uid 97); 17 Jun 2002 13:59:39 -0000 Mailing-List: contact lucene-dev-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Lucene Developers List" Reply-To: "Lucene Developers List" Delivered-To: mailing list lucene-dev@jakarta.apache.org Received: (qmail 18678 invoked by uid 97); 17 Jun 2002 13:59:38 -0000 X-Antivirus: nagoya (v4198 created Apr 24 2002) Date: 17 Jun 2002 13:59:29 -0000 Message-ID: <20020617135929.77355.qmail@icarus.apache.org> From: cmarschner@apache.org To: jakarta-lucene-sandbox-cvs@apache.org Subject: cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTaskQueue.java FetcherThread.java FetcherThreadFactory.java RobotExclusionFilter.java ThreadMonitor.java URLMessage.java URLVisitedFilter.java HostInfo.java HostManager.java X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N cmarschner 2002/06/17 06:59:29 Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTaskQueue.java FetcherThread.java FetcherThreadFactory.java RobotExclusionFilter.java ThreadMonitor.java URLMessage.java URLVisitedFilter.java Removed: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher HostInfo.java HostManager.java Log: added URLNormalizer. Changed filters to use normalized URLs if possible Revision Changes Path 1.4 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java Index: Fetcher.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- Fetcher.java 1 Jun 2002 18:55:15 -0000 1.3 +++ Fetcher.java 17 Jun 2002 13:59:28 -0000 1.4 @@ -65,6 +65,7 @@ import java.util.LinkedList; import de.lanlab.larm.fetcher.FetcherTask; +import de.lanlab.larm.net.*; /** * filter class; the Fetcher is the main class which keeps the ThreadPool that 1.4 +3 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java Index: FetcherMain.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- FetcherMain.java 1 Jun 2002 18:55:15 -0000 1.3 +++ FetcherMain.java 17 Jun 2002 13:59:28 -0000 1.4 @@ -62,6 +62,7 @@ import de.lanlab.larm.gui.*; import de.lanlab.larm.util.*; import de.lanlab.larm.storage.*; +import de.lanlab.larm.net.*; import javax.swing.UIManager; import HTTPClient.*; import org.apache.oro.text.regex.MalformedPatternException; @@ -278,7 +279,7 @@ { try { - messageHandler.putMessage(new URLMessage(url, null, isFrame, null)); + messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager)); } catch (Exception e) { 1.3 +16 -15 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java Index: FetcherTaskQueue.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- FetcherTaskQueue.java 22 May 2002 23:09:17 -0000 1.2 +++ FetcherTaskQueue.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -186,16 +186,17 @@ public static void main(String args[]) { FetcherTaskQueue q = new FetcherTaskQueue(); + de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10); System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo"); try { - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm))); } catch (Throwable t) { @@ -217,9 +218,9 @@ try { System.out.println("put 3 lmus."); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm))); System.out.print("pull out 1st element [lmu/1]: "); System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println("size now [2]: " + q.size()); @@ -227,9 +228,9 @@ System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println("size now [1]: " + q.size()); System.out.println("put in 3 yahoos"); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm))); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [3]: " + q.size()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); @@ -237,7 +238,7 @@ System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [1]: " + q.size()); System.out.println("put in another Yahoo"); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm))); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [1]: " + q.size()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); 1.3 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java Index: FetcherThread.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- FetcherThread.java 22 May 2002 23:09:17 -0000 1.2 +++ FetcherThread.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -56,6 +56,7 @@ import de.lanlab.larm.threads.ServerThread; import de.lanlab.larm.util.State; +import de.lanlab.larm.net.HostManager; /** * a server thread for the thread pool that records the number 1.3 +75 -58 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java Index: FetcherThreadFactory.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- FetcherThreadFactory.java 22 May 2002 23:09:17 -0000 1.2 +++ FetcherThreadFactory.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -1,64 +1,69 @@ -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . +/* + * ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . */ - package de.lanlab.larm.fetcher; import de.lanlab.larm.threads.*; +import de.lanlab.larm.net.*; /** - * this factory simply creates fetcher threads. It's passed - * to the ThreadPool because the pool is creating the threads on its own - * @version $Id$ + * this factory simply creates fetcher threads. It's passed to the ThreadPool + * because the pool is creating the threads on its own + * + * @author Administrator + * @created 14. Juni 2002 + * @version $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17 + * cmarschner Exp $ */ public class FetcherThreadFactory extends ThreadFactory { @@ -69,16 +74,28 @@ HostManager hostManager; + + /** + * Constructor for the FetcherThreadFactory object + * + * @param hostManager Description of the Parameter + */ public FetcherThreadFactory(HostManager hostManager) { this.hostManager = hostManager; } - public ServerThread createServerThread(int count) + /** + * Description of the Method + * + * @param count Description of the Parameter + * @return Description of the Return Value + */ + public ServerThread createServerThread(int count) { ServerThread newThread = new FetcherThread(count, threadGroup, hostManager); newThread.setPriority(4); return newThread; } -} \ No newline at end of file +} 1.3 +14 -13 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java Index: RobotExclusionFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- RobotExclusionFilter.java 22 May 2002 23:09:17 -0000 1.2 +++ RobotExclusionFilter.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -63,6 +63,7 @@ import de.lanlab.larm.util.*; import de.lanlab.larm.threads.*; import HTTPClient.*; +import de.lanlab.larm.net.*; /** * this factory simply creates fetcher threads. It's gonna be passed to the @@ -164,13 +165,13 @@ URLMessage urlMsg = ((URLMessage) message); URL url = urlMsg.getUrl(); //assert url != null; - HostInfo h = hostManager.getHostInfo(url.getHost()); + HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase()); if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt()) { log.logThreadSafe("handleRequest: starting to get robots.txt"); // probably this results in Race Conditions here - rePool.doTask(new RobotExclusionTask(h), new Integer(h.id)); + rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId())); h.setLoadingRobotsTxt(true); } @@ -182,7 +183,7 @@ //log.logThreadSafe("handleRequest: other thread is loading"); // assert h.queuedRequests != null - h.queuedRequests.insert(message); + h.insertIntoQueue(message); // not thread safe log.logThreadSafe("handleRequest: queued file " + url); return null; @@ -273,14 +274,14 @@ // assert hostInfo != null; String threadName = Thread.currentThread().getName(); - log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName); + log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName()); //hostInfo.setLoadingRobotsTxt(true); String[] disallows = null; boolean errorOccured = false; try { log.logThreadSafe("task " + threadName + ": getting connection"); - HTTPConnection conn = new HTTPConnection(hostInfo.hostName); + HTTPConnection conn = new HTTPConnection(hostInfo.getHostName()); conn.setTimeout(30000); // wait at most 20 secs @@ -348,8 +349,8 @@ // crawl everything hostInfo.setLoadingRobotsTxt(false); log.logThreadSafe("task " + threadName + ": error occured"); - log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); - hostInfo.isLoadingRobotsTxt = false; + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back"); + hostInfo.setLoadingRobotsTxt(false); putBackURLs(); } } @@ -359,8 +360,8 @@ { hostInfo.setRobotsChecked(true, disallows); log.logThreadSafe("task " + threadName + ": done"); - log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); - hostInfo.isLoadingRobotsTxt = false; + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back"); + hostInfo.setLoadingRobotsTxt(false); putBackURLs(); } } @@ -373,12 +374,12 @@ */ private void putBackURLs() { - while (hostInfo.queuedRequests.size() > 0) + while (hostInfo.getQueueSize() > 0) { - messageHandler.putMessage((Message) hostInfo.queuedRequests.remove()); + messageHandler.putMessage((Message) hostInfo.removeFromQueue()); } log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished"); - hostInfo.queuedRequests = null; + hostInfo.removeQueue(); } 1.3 +2 -1 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java Index: ThreadMonitor.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- ThreadMonitor.java 22 May 2002 23:09:17 -0000 1.2 +++ ThreadMonitor.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -61,6 +61,7 @@ import java.io.*; import de.lanlab.larm.util.State; import de.lanlab.larm.util.SimpleLoggerManager; +import de.lanlab.larm.net.*; /** * this monitor takes a sample of every thread every x milliseconds, 1.3 +177 -60 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java Index: URLMessage.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- URLMessage.java 22 May 2002 23:09:17 -0000 1.2 +++ URLMessage.java 17 Jun 2002 13:59:28 -0000 1.3 @@ -1,66 +1,71 @@ -/* ==================================================================== - * The Apache Software License, Version 1.1 +/* + * ==================================================================== + * The Apache Software License, Version 1.1 * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . */ - package de.lanlab.larm.fetcher; import java.net.*; import java.io.*; import de.lanlab.larm.util.URLUtils; +import de.lanlab.larm.net.URLNormalizer; +import de.lanlab.larm.net.HostManager; /** * represents a URL which is passed around in the messageHandler - * @version $Id$ + * + * @author Administrator + * @created 14. Juni 2002 + * @version $Id$ */ public class URLMessage implements Message, Serializable { @@ -68,14 +73,51 @@ * the URL */ protected URL url; - protected String urlString; + /** + * Description of the Field + */ + protected volatile String urlString; + + /** + * referer or null + */ protected URL referer; - protected String refererString; + + /** + * externalized referer URL, to prevent multiple calls to url.toExternalForm() + */ + protected volatile String refererString; + + /** + * externalized referer URL, to prevent multiple calls to url.toExternalForm() + */ + protected volatile String refererNormalizedString; + + /** + * normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer} + * (lower case, index.* removed, all characters except alphanumeric ones escaped) + */ + protected String normalizedURLString; + + boolean isFrame; + + /** + * anchor text, as in <a href="...">Anchor</a> + */ protected String anchor; - public URLMessage(URL url, URL referer, boolean isFrame, String anchor) + + /** + * Constructor for the URLMessage object + * + * @param url Description of the Parameter + * @param referer Description of the Parameter + * @param isFrame Description of the Parameter + * @param anchor Description of the Parameter + */ + public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager) { //super(); this.url = url; @@ -83,69 +125,144 @@ this.referer = referer; this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null; + this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null; this.isFrame = isFrame; this.anchor = anchor != null ? anchor : ""; + this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager)); + //this.normalizedURLString = URLNormalizer. //System.out.println("" + refererString + " -> " + urlString); } + public String getNormalizedURLString() + { + return this.normalizedURLString; + } + + /** + * Gets the url attribute of the URLMessage object + * + * @return The url value + */ public URL getUrl() { return this.url; } + + /** + * Gets the referer attribute of the URLMessage object + * + * @return The referer value + */ public URL getReferer() { return this.referer; } + /** + * Description of the Method + * + * @return Description of the Return Value + */ public String toString() { return urlString; } + + /** + * Gets the uRLString attribute of the URLMessage object + * + * @return The uRLString value + */ public String getURLString() { return urlString; } + + /** + * Gets the refererString attribute of the URLMessage object + * + * @return The refererString value + */ public String getRefererString() { return refererString; } + + /** + * Gets the anchor attribute of the URLMessage object + * + * @return The anchor value + */ public String getAnchor() { return anchor; } + /** + * Description of the Method + * + * @return Description of the Return Value + */ public int hashCode() { return url.hashCode(); } - private void writeObject(java.io.ObjectOutputStream out) throws IOException + + /** + * Description of the Method + * + * @param out Description of the Parameter + * @exception IOException Description of the Exception + */ + private void writeObject(java.io.ObjectOutputStream out) + throws IOException { out.writeObject(url); out.writeObject(referer); out.writeBoolean(isFrame); out.writeUTF(anchor); + out.writeUTF(refererNormalizedString); + out.writeUTF(normalizedURLString); + } - private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException + + /** + * Description of the Method + * + * @param in Description of the Parameter + * @exception IOException Description of the Exception + * @exception ClassNotFoundException Description of the Exception + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { - url = (URL)in.readObject(); - referer = (URL)in.readObject(); + url = (URL) in.readObject(); + referer = (URL) in.readObject(); urlString = url.toExternalForm(); refererString = referer.toExternalForm(); isFrame = in.readBoolean(); anchor = in.readUTF(); + refererNormalizedString = in.readUTF(); + normalizedURLString = in.readUTF(); } + + /** + * Gets the info attribute of the URLMessage object + * + * @return The info value + */ public String getInfo() { - return (referer != null ? refererString : "") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor; + return (referer != null ? refererString : "") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor; } } 1.4 +2 -2 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java Index: URLVisitedFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- URLVisitedFilter.java 1 Jun 2002 18:55:15 -0000 1.3 +++ URLVisitedFilter.java 17 Jun 2002 13:59:28 -0000 1.4 @@ -123,7 +123,7 @@ { URLMessage urlMessage = ((URLMessage) message); URL url = urlMessage.getUrl(); - String urlString = urlMessage.getURLString(); + String urlString = urlMessage.getNormalizedURLString(); if (urlHash.contains(urlString)) { //System.out.println("URLVisitedFilter: " + urlString + " already present."); -- To unsubscribe, e-mail: For additional commands, e-mail: