lucene-dev mailing list archives

From: cmarsch...@apache.org
Subject: cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherMain.java
Date: Tue, 22 Oct 2002 15:05:07 GMT
cmarschner    2002/10/22 08:05:07

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        FetcherMain.java
  Log:
  more than one start URL; hostResolver
  
  Revision  Changes    Path
  1.7       +186 -46   jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
  
  Index: FetcherMain.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- FetcherMain.java	15 Sep 2002 00:38:14 -0000	1.6
  +++ FetcherMain.java	22 Oct 2002 15:05:07 -0000	1.7
  @@ -56,12 +56,12 @@
   
   import de.lanlab.larm.threads.ThreadPoolObserver;
   import de.lanlab.larm.threads.ThreadPool;
  -import de.lanlab.larm.gui.*;
   import de.lanlab.larm.util.*;
   import de.lanlab.larm.storage.*;
   import de.lanlab.larm.net.*;
   import HTTPClient.*;
   import org.apache.oro.text.regex.MalformedPatternException;
  +import java.io.*;
   import java.net.MalformedURLException;
   import java.net.URL;
   import java.util.*;
  @@ -110,6 +110,17 @@
       protected RobotExclusionFilter reFilter;
   
       /**
  +     * the host manager keeps track of all hosts and is used by the filters.
  +     */
  +    protected HostManager hostManager;
  +
  +    /**
  +     * the host resolver can change a host that occurs within a URL to a different
  +     * host, depending on the rules specified in a configuration file
  +     */
  +    protected HostResolver hostResolver;
  +
  +    /**
       * this rather flaky filter just filters out some URLs, e.g. different views
       * generated by the Apache DirIndex module. Has to be made
       * configurable in the near future
  @@ -122,10 +133,6 @@
        */
       protected URLLengthFilter urlLengthFilter;
   
  -    /**
  -     * the host manager keeps track of all hosts and is used by the filters.
  -     */
  -    protected HostManager hostManager;
   
       /**
        * this is the main document fetcher. It contains a thread pool that fetches the
  @@ -152,7 +159,7 @@
        *
        * @param nrThreads  number of fetcher threads to be created
        */
  -    private FetcherMain(int nrThreads)
  +    public FetcherMain(int nrThreads, String hostResolverFile) throws Exception
       {
           // to make things clear, this method is commented a bit better than
           // the rest of the program...
  @@ -172,44 +179,73 @@
   
           // the LogStorage used here does extensive logging. It logs all links and
           // document information.
  -        // it also saves all documents to page files. Probably this single storage
  -        // could also be replaced by a pipeline; or even incorporated into the
  -        // existing message pipeline
  -        SimpleLogger storeLog = new SimpleLogger("store", false);
  -        SimpleLogger linksLog = new SimpleLogger("links", false);
  -
  +        // it also saves all documents to page files.
  +        File logsDir = new File("logs");
  +        logsDir.mkdir();    // ensure log directory exists
  +
  +        // in this experimental implementation, the crawler is pretty verbose
  +        // the SimpleLogger, however, is a FlyWeight logger which is buffered and
  +        // not thread safe by default
  +        SimpleLogger storeLog = new SimpleLogger("store", /* add date/time? */ false);
  +        SimpleLogger visitedLog = new SimpleLogger("URLVisitedFilter", /* add date/time? */ false);
  +        SimpleLogger scopeLog = new SimpleLogger("URLScopeFilter", /* add date/time? */ false);
  +        SimpleLogger pathsLog = new SimpleLogger("KnownPathsFilter", /* add date/time? */ false);
  +        SimpleLogger linksLog = new SimpleLogger("links", /* add date/time? */ false);
  +        SimpleLogger lengthLog = new SimpleLogger("length", /* add date/time? */ false);
   
           StoragePipeline storage = new StoragePipeline();
  -        //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
  +
  +
  +        // in the default configuration, the crawler will only save the document
  +        // information to store.log and the link information to links.log
  +        // The contents of the files are _not_ saved. If you set
  +        // "save in page files" to "true", they will be saved in "page files",
  +        // binary files each containing a set of documents. Here, the
  +        // maximum file size is ~50 MB (crawled files won't be split up into different
  +        // files). The logs/store.log file contains pointers to these files: a page
  +        // file number, the offset within that file, and the document's length
  +
  +        // FIXME: default constructor for all storages + bean access methods
  +        storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false,
  +                                             /* page file prefix */ "logs/pagefile"));
           storage.addLinkStorage(new LinkLogStorage(linksLog));
           storage.addLinkStorage(messageHandler);
  -
  +        /*
  +        // experimental Lucene storage. will slow the crawler down *a lot*
           LuceneStorage luceneStorage = new LuceneStorage();
           luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
           luceneStorage.setCreate(true);
   	// FIXME: index name and path need to be configurable
           luceneStorage.setIndexName("luceneIndex");
  +        // the field names come from URLMessage.java and WebDocument.java. See
  +        // LuceneStorage source for details
           luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
           luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE
| LuceneStorage.TOKEN);
           storage.addDocStorage(luceneStorage);
  +        */
  +
           storage.open();
   
           //storage.addStorage(new JMSStorage(...));
   
  -        // a third example would be the NullStorage, which converts the documents into
  -        // heat, which evaporates above the processor
  -        // NullStorage();
  +        // create the filters and add them to the message queue
  +        urlScopeFilter = new URLScopeFilter(scopeLog);
   
  +        // dnsResolver = new DNSResolver();
           hostManager = new HostManager(1000);
  +        hostResolver = new HostResolver();
  +        hostResolver.initFromFile(hostResolverFile);
  +        hostManager.setHostResolver(hostResolver);
  +
  +//        hostManager.addSynonym("www.fachsprachen.uni-muenchen.de", "www.fremdsprachen.uni-muenchen.de");
  +//        hostManager.addSynonym("www.uni-muenchen.de", "www.lmu.de");
  +//        hostManager.addSynonym("www.uni-muenchen.de", "uni-muenchen.de");
  +//        hostManager.addSynonym("webinfo.uni-muenchen.de", "www.webinfo.uni-muenchen.de");
  +//        hostManager.addSynonym("webinfo.uni-muenchen.de", "webinfo.campus.lmu.de");
  +//        hostManager.addSynonym("www.s-a.uni-muenchen.de", "s-a.uni-muenchen.de");
   
  -        // create the filters and add them to the message queue
           reFilter = new RobotExclusionFilter(hostManager);
  -        urlScopeFilter = new URLScopeFilter();
  -        urlVisitedFilter = new URLVisitedFilter(100000);
  -        knownPathsFilter = new KnownPathsFilter();
  -        urlLengthFilter = new URLLengthFilter(255);
   
  -        // dnsResolver = new DNSResolver();
           fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
   
           // prevent message box popups
  @@ -218,6 +254,8 @@
           // prevent GZipped files from being decoded
           HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
   
  +        urlVisitedFilter = new URLVisitedFilter(visitedLog, 100000);
  +
           // initialize the threads
           fetcher.init();
   
  @@ -241,12 +279,15 @@
           messageHandler.addListener(reFilter);
           messageHandler.addListener(urlVisitedFilter);
           messageHandler.addListener(knownPathsFilter);
  +
           messageHandler.addListener(fetcher);
   
  -        /* uncomment this to enable HTTPClient logging
  +         //uncomment this to enable HTTPClient logging
  +        /*
           try
           {
  -            HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
  +            HTTPClient.Log.setLogWriter(new java.io.OutputStreamWriter(System.out) //new java.io.FileWriter("logs/HttpClient.log")
  +            ,false);
               HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
           }
           catch (Exception e)
  @@ -254,6 +295,7 @@
               e.printStackTrace();
           }
           */
  +
       }
   
   
  @@ -276,11 +318,11 @@
        * @exception java.net.MalformedURLException  Description of Exception
        */
       public void putURL(URL url, boolean isFrame)
  -        throws java.net.MalformedURLException
  +     //   throws java.net.MalformedURLException
       {
           try
           {
  -            messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
  +            messageHandler.putMessage(new URLMessage(url, null, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, null, this.hostResolver));
           }
           catch (Exception e)
           {
  @@ -288,7 +330,6 @@
               System.out.println("Exception: " + e.getMessage());
               e.printStackTrace();
           }
  -        //System.out.println("URLs geschrieben");
       }
   
   
  @@ -341,24 +382,69 @@
        *
        * @param args  The command line arguments
        */
  -    public static void main(String[] args)
  +    public static void main(String[] args) throws Exception
       {
           int nrThreads = 10;
   
  -        String startURL = "";
  -        String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
  +        ArrayList startURLs = new ArrayList();
  +        String restrictTo = ".*";
           boolean gui = false;
           boolean showInfo = false;
  -        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab
2000-02");
  -
  +        String hostResolverFile = "";
  +        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - B.20020914");
   	// FIXME: consider using Jakarta Commons' CLI package for command line parameters
  +
           for (int i = 0; i < args.length; i++)
           {
               if (args[i].equals("-start"))
               {
                   i++;
  -                startURL = args[i];
  -                System.out.println("Start-URL set to: " + startURL);
  +                String arg = args[i];
  +                if(arg.startsWith("@"))
  +                {
  +                    // input is a file with one URL per line
  +                    String fileName = arg.substring(1);
  +                    System.out.println("reading URL file " + fileName);
  +                    try
  +                    {
  +                        BufferedReader r = new BufferedReader(new FileReader(fileName));
  +                        String line;
  +                        int count=0;
  +                        while((line = r.readLine()) != null)
  +                        {
  +                            try
  +                            {
  +                                startURLs.add(new URL(line));
  +                                count++;
  +                            }
  +                            catch (MalformedURLException e)
  +                            {
  +                                System.out.println("Malformed URL '" + line + "' in line " + (count+1) + " of file " + fileName);
  +
  +                            }
  +                        }
  +                        r.close();
  +                        System.out.println("added " + count + " URLs from " + fileName);
  +                    }
  +                    catch(IOException e)
  +                    {
  +                        System.out.println("Couldn't read '" + fileName + "': " + e);
  +                    }
  +                }
  +                else
  +                {
  +                    System.out.println("got URL " + arg);
  +                    try
  +                    {
  +                        startURLs.add(new URL(arg));
  +                        System.out.println("Start-URL added: " + arg);
  +                    }
  +                    catch (MalformedURLException e)
  +                    {
  +                        System.out.println("Malformed URL '" + arg + "'");
  +
  +                    }
  +                }
               }
               else if (args[i].equals("-restrictto"))
               {
  @@ -372,6 +458,13 @@
                   nrThreads = Integer.parseInt(args[i]);
                   System.out.println("Threads set to " + nrThreads);
               }
  +            else if (args[i].equals("-hostresolver"))
  +            {
  +                i++;
  +                hostResolverFile = args[i];
  +                System.out.println("reading host resolver props from  '" + hostResolverFile
+ "'");
  +
  +            }
               else if (args[i].equals("-gui"))
               {
                   gui = true;
  @@ -390,10 +483,60 @@
           //URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
           // replaced by HTTPClient
   
  -        FetcherMain f = new FetcherMain(nrThreads);
  -        if (showInfo || (startURL.equals("") && gui == false))
  +        FetcherMain f = new FetcherMain(nrThreads, hostResolverFile);
  +        if (showInfo || "".equals(hostResolverFile) || (startURLs.isEmpty() && gui == false))
           {
  -            System.out.println("Usage: FetcherMain -start <URL> -restrictto <RegEx>
[-threads <nr=10>]"); // [-gui]
  +            System.out.println("The LARM crawler\n" +
  +                               "\n" +
  +                               "The LARM crawler is a fast parallel crawler, currently
designed for\n" +
  +                               "large intranets (up to a couple hundred hosts with some
hundred thousand\n" +
  +                               "documents). It is currently restricted by a relatively
high memory overhead\n" +
  +                               "per crawled host, and by a HashMap of already crawled URLs
which is also held\n" +
  +                               "in memory.\n" +
  +                               "\n" +
  +                               "Usage:   FetcherMain <-start <URL>|@<filename>>+
-restrictto <RegEx>\n" +
  +                               "                    [-threads <nr=10>] [-hostresolver
<filename>]\n" +
  +                               "\n" +
  +                               "Commands:\n" +
  +                               "         -start specify one or more URLs to start with.
You can as well specify a file" +
  +                               "                that contains URLs, one each line\n" +
  +                               "         -restrictto a Perl 5 regular expression each URL
must match. It is run against the\n" +
  +                               "                     _complete_ URL, including the http://
part\n" +
  +                               "         -threads  the number of crawling threads. defaults
to 10\n" +
  +                               "         -hostresolver specify a file that contains rules
for changing the host part of \n" +
  +                               "                       a URL during the normalization process
(experimental).\n" +
  +                               "Caution: The <RegEx> is applied to the _normalized_
form of a URL.\n" +
  +                               "         See URLNormalizer for details\n" +
  +                               "Example:\n" +
  +                               "    -start @urls1.txt -start @urls2.txt -start http://localhost/
" +
  +                               "    -restrictto http://[^/]*\\.localhost/.* -threads 25\n"
+
  +                               "\n" +
  +                               "The host resolver file may contain the following commands:
\n" +
  +                               "  startsWith(part1) = part2\n" +
  +                               "      if host starts with part1, this part will be replaced
by part2\n" +
  +                               "   endsWith(part1) = part2\n" +
  +                               "       if host ends with part1, this part will be replaced
by part2. This is done after\n" +
  +                               "       startsWith was processed\n" +
  +                               "   synonym(host1) = host2\n" +
  +                               "       the keywords startsWith, endsWith and synonym are
case sensitive\n" +
  +                               "       host1 will be replaced with host2. this is done
_after_ startsWith and endsWith was \n" +
  +                               "       processed. Due to a bug in BeanUtils, dots are not
allowed in the keys (in parentheses)\n" +
  +                               "       and have to be escaped with commas. To simplify,
commas are also replaced in property \n" +
  +                               "       values. So just use commas instead of dots. The
resulting host names are only used for \n" +
  +                               "       comparisons and do not have to be existing URLs
(although the syntax has to be valid).\n" +
  +                               "       However, the names will often be passed to java.net.URL
which will try to make a DNS name\n" +
  +                               "       resolution, which will time out if the server can't
be found. \n" +
  +                               "   Example:" +
  +                               "     synonym(www1,host,com) = host,com\n" +
  +                               "     startsWith(www,) = ,\n" +
  +                               "     endsWith(host1,com) = host,com\n" +
  +                               "The crawler will show a status message every 5 seconds,
which is printed by ThreadMonitor.java\n" +
  +                               "It will stop after the ThreadMonitor found the message
queue and the crawling threads to be idle a \n" +
  +                               "couple of times.\n" +
  +                               "The crawled data will be saved within a logs/ directory.
A cachingqueue/ directory is used for\n" +
  +                               "temporary queues.\n" +
  +                               "Note that this implementation is experimental, and that
the command line options cover only a part \n" +
  +                               "of the parameters. Much of the configuration can only be
done by modifying FetcherMain.java\n");
               System.exit(0);
           }
           try
  @@ -403,17 +546,14 @@
               if (gui)
               {
                   // f.initGui(f, startURL);
  +                // the GUI is no longer supported
               }
               else
               {
  -                try
  -                {
  -                    f.startMonitor();
  -                    f.putURL(new URL(startURL), false);
  -                }
  -                catch (MalformedURLException e)
  +                f.startMonitor();
  +                for(Iterator it = startURLs.iterator(); it.hasNext(); )
                   {
  -                    System.out.println("Malformed URL");
  +                    f.putURL((URL)it.next(), false);
                   }
               }
           }
  
  
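For illustration: since -start now also accepts @<filename> arguments, a start file is
just a plain-text list of URLs, one per line (the file name and URLs below are made up):

    http://localhost/index.html
    http://www.uni-muenchen.de/

A matching invocation, assuming the LARM classes are on the classpath, might look like:

    java de.lanlab.larm.fetcher.FetcherMain -start @urls.txt \
        -restrictto "http://[^/]*\.uni-muenchen\.de/.*" \
        -threads 25 -hostresolver hostresolver.properties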
  
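The -hostresolver file uses the startsWith/endsWith/synonym syntax described in the help
text, with commas standing in for dots (the BeanUtils workaround mentioned there). A
hypothetical hostresolver.properties, with host names modeled on the commented-out
synonyms in the diff:

    startsWith(www,) = ,
    endsWith(,campus,lmu,de) = ,uni-muenchen,de
    synonym(uni-muenchen,de) = www,lmu,de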

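A minimal sketch of the documented rule order (startsWith first, then endsWith, then
synonym last). This is not the actual de.lanlab.larm.net.HostResolver; the class and
method names are invented, and only the rule semantics come from the help text:

    import java.util.Iterator;
    import java.util.LinkedHashMap;
    import java.util.Map;

    // Hypothetical sketch, not the LARM implementation.
    public class HostRewriteSketch
    {
        private final Map startsWithRules = new LinkedHashMap(); // prefix -> replacement
        private final Map endsWithRules   = new LinkedHashMap(); // suffix -> replacement
        private final Map synonymRules    = new LinkedHashMap(); // whole host -> canonical host

        public void addStartsWith(String part1, String part2) { startsWithRules.put(part1, part2); }
        public void addEndsWith(String part1, String part2)   { endsWithRules.put(part1, part2); }
        public void addSynonym(String host1, String host2)    { synonymRules.put(host1, host2); }

        public String resolve(String host)
        {
            // 1. startsWith(part1) = part2: replace a matching prefix
            for (Iterator it = startsWithRules.entrySet().iterator(); it.hasNext(); )
            {
                Map.Entry e = (Map.Entry) it.next();
                String part1 = (String) e.getKey();
                if (host.startsWith(part1))
                {
                    host = (String) e.getValue() + host.substring(part1.length());
                    break;
                }
            }
            // 2. endsWith(part1) = part2: replace a matching suffix, after startsWith
            for (Iterator it = endsWithRules.entrySet().iterator(); it.hasNext(); )
            {
                Map.Entry e = (Map.Entry) it.next();
                String part1 = (String) e.getKey();
                if (host.endsWith(part1))
                {
                    host = host.substring(0, host.length() - part1.length()) + e.getValue();
                    break;
                }
            }
            // 3. synonym(host1) = host2: map the whole rewritten host, applied last
            String canonical = (String) synonymRules.get(host);
            return canonical != null ? canonical : host;
        }

        public static void main(String[] args)
        {
            HostRewriteSketch r = new HostRewriteSketch();
            r.addStartsWith("www.", "");                   // startsWith(www,) = ,
            r.addSynonym("uni-muenchen.de", "www.lmu.de"); // synonym(uni-muenchen,de) = www,lmu,de
            System.out.println(r.resolve("www.uni-muenchen.de")); // prints www.lmu.de
        }
    }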

