lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher KnownPathsFilter.java
Date Tue, 22 Oct 2002 15:12:43 GMT
cmarschner    2002/10/22 08:12:42

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        KnownPathsFilter.java
  Log:
  added logging
  
  Revision  Changes    Path
  1.3       +55 -30    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/KnownPathsFilter.java
  
  Index: KnownPathsFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/KnownPathsFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- KnownPathsFilter.java	22 May 2002 23:09:17 -0000	1.2
  +++ KnownPathsFilter.java	22 Oct 2002 15:12:42 -0000	1.3
  @@ -55,6 +55,9 @@
   package de.lanlab.larm.fetcher;
   
   import java.net.*;
  +import java.util.ArrayList;
  +import java.io.*;
  +import de.lanlab.larm.util.*;
   
   /**
    * this can be considered a hack
  @@ -68,14 +71,12 @@
   
       String[] pathsToFilter =
       {
  -        "/robots.txt"
  +        "/robots.txt",
  +        "/lmu-32321800/"
       };
   
  -    String[] hostFilter =
  -    {
  -        "www.nm.informatik.uni-muenchen.de",
  -        "cgi.cip.informatik.uni-muenchen.de"
  -    };
  +    ArrayList hosts = new ArrayList();
  +    Object[] hostsToFilter = null;
   
       String[] filesToFilter =
       {
  @@ -93,18 +94,27 @@
       int pathLength;
       int fileLength;
       int hostLength;
  -
  +    SimpleLogger log;
   
       /**
        * Constructor for the KnownPathsFilter object
        */
  -    public KnownPathsFilter()
  +    public KnownPathsFilter(SimpleLogger log)
       {
           pathLength = pathsToFilter.length;
  +        this.log = log;
           fileLength = filesToFilter.length;
  -        hostLength = hostFilter.length;
       }
   
  +    /**
  +     * add "forbidden" host name
  +     * note: this has no effect after the filter has been added to the message handler
  +     * @param hostname
  +     */
  +    public void addHostToFilter(String hostname)
  +    {
  +        this.hosts.add(hostname);
  +    }
   
       /**
        * Description of the Method
  @@ -114,34 +124,47 @@
        */
       public Message handleRequest(Message message)
       {
  -        URL url = ((URLMessage)message).getUrl();
  -        String file = url.getFile();
  -        String host = url.getHost();
  -        int i;
  -        for (i = 0; i < pathLength; i++)
  +        try
           {
  -            if (file.startsWith(pathsToFilter[i]))
  +            URL url = new URL(((URLMessage)message).getNormalizedURLString());
  +            String file = url.getFile();
  +            String host = url.getHost();
  +            int i;
  +            for (i = 0; i < pathLength; i++)
               {
  -                filtered++;
  -                return null;
  +                if (file.startsWith(pathsToFilter[i]))
  +                {
  +                    filtered++;
  +                    //log.log("KnownPathsFilter: filtered file '" + url + "' - file starts with " + pathsToFilter[i]);
  +                    log.log(message.toString());
  +                    return null;
  +                }
               }
  -        }
  -        for (i = 0; i < fileLength; i++)
  -        {
  -            if (file.endsWith(filesToFilter[i]))
  +            for (i = 0; i < fileLength; i++)
               {
  -                filtered++;
  -                return null;
  +                if (file.endsWith(filesToFilter[i]))
  +                {
  +                    filtered++;
  +                    //log.log("KnownPathsFilter: filtered file '" + url + "' - file ends with " + filesToFilter[i]);
  +                    log.log(message.toString());
  +                    return null;
  +                }
               }
  -        }
  -        for (i = 0; i<hostLength; i++)
  -        {
  -            if(hostFilter[i].equals(host))
  +            for (i = 0; i<hostLength; i++)
               {
  -                filtered++;
  -                return null;
  +                if(hostsToFilter[i].equals(host))
  +                {
  +                    filtered++;
  +                    //log.log("KnownPathsFilter: filtered file '" + url + "' - host equals " + host);
  +                    log.log(message.toString());
  +                    return null;
  +                }
               }
           }
  +        catch(MalformedURLException e)
  +        {
  +            e.printStackTrace();
  +        }
           return message;
       }
   
  @@ -154,5 +177,7 @@
       public void notifyAddedToMessageHandler(MessageHandler handler)
       {
           this.messageHandler = messageHandler;
  +        this.hostsToFilter = hosts.toArray();
  +        this.hostLength = hostsToFilter.length;
       }
   }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message