lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher Fetcher.java FetcherMain.java FetcherTask.java
Date Tue, 18 Jun 2002 00:45:10 GMT
cmarschner    2002/06/17 17:45:10

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        Fetcher.java FetcherMain.java FetcherTask.java
  Log:
  added experimental version of LuceneStorage
  
  Revision  Changes    Path
  1.5       +7 -1      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java
  
  Index: Fetcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- Fetcher.java	17 Jun 2002 13:59:28 -0000	1.4
  +++ Fetcher.java	18 Jun 2002 00:45:10 -0000	1.5
  @@ -93,6 +93,11 @@
        */
       DocumentStorage storage;
   
  +   /**
  +     * the storage where the links are saved to
  +     */
  +    LinkStorage linkStorage;
  +
       /**
        * the host manager keeps track of host information
        */
  @@ -110,6 +115,7 @@
       public Fetcher(int maxThreads, DocumentStorage docStorage, LinkStorage linkStorage, HostManager hostManager)
       {
           this.storage = storage;
  +        this.linkStorage = linkStorage;
           FetcherTask.setDocStorage(docStorage);
           FetcherTask.setLinkStorage(linkStorage);
           fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager));
  
  
  
  1.5       +12 -3     jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
  
  Index: FetcherMain.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- FetcherMain.java	17 Jun 2002 13:59:28 -0000	1.4
  +++ FetcherMain.java	18 Jun 2002 00:45:10 -0000	1.5
  @@ -183,10 +183,19 @@
   
   
           StoragePipeline storage = new StoragePipeline();
  -        storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
  +        //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
           storage.addLinkStorage(new LinkLogStorage(linksLog));
           storage.addLinkStorage(messageHandler);
  -        //storage.addStorage(new LuceneStorage(...));
  +
  +        LuceneStorage luceneStorage = new LuceneStorage();
  +        luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
  +        luceneStorage.setCreate(true);
  +        luceneStorage.setIndexName("luceneIndex");
  +        luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
  +        luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
  +        storage.addDocStorage(luceneStorage);
  +        storage.open();
  +
           //storage.addStorage(new JMSStorage(...));
   
           // a third example would be the NullStorage, which converts the documents into
  
  
  
  1.5       +23 -13    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
  
  Index: FetcherTask.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- FetcherTask.java	17 Jun 2002 13:58:33 -0000	1.4
  +++ FetcherTask.java	18 Jun 2002 00:45:10 -0000	1.5
  @@ -80,6 +80,8 @@
    * the storage. If it's an HTML document, it will be parsed and all links will
    * be put into the message handler again.
    *
  + * stores contents of the files in field "contents"
  + *
    * @author    Clemens Marschner
    * @version $Id$
    */
  @@ -406,32 +408,40 @@
                           Tokenizer tok = new Tokenizer();
                           tok.setLinkHandler(this);
                           tok.parse(new SimpleCharArrayReader(fullCharBuffer));
  +
  +                        taskState.setState(FT_STORING, ipURL);
  +                        linkStorage.storeLinks(foundUrls);
  +                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
  +                        d.addField("content", fullCharBuffer);
  +                        docStorage.store(d);
                       }
                       else
                       {
                           // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
  -                        errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
  +                        //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
  +                        taskState.setState(FT_STORING, ipURL);
  +                        linkStorage.storeLinks(foundUrls);
  +                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
  +                        d.addField("content", fullBuffer);
  +                        docStorage.store(d);
                       }
                       log.log("scanned");
                   }
  -                taskState.setState(FT_STORING, ipURL);
  -                linkStorage.storeLinks(foundUrls);
  -                //messageHandler.putMessages(foundUrls);
  -                docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager));
  +
                   log.log("stored");
               }
           }
           catch (InterruptedIOException e)
           {
               // timeout while reading this file
  -            System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
  +            //System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
               errorLog.log("error: Timeout: " + this.actURLMessage.getUrl());
               hi.badRequest();
           }
           catch (FileNotFoundException e)
           {
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
  +            //System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
               errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
           }
           catch(NoRouteToHostException e)
  @@ -439,7 +449,7 @@
               // router is down or firewall prevents to connect
               hi.setReachable(false);
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
  +            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
               // e.printStackTrace();
               errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
           }
  @@ -448,14 +458,14 @@
               // no server is listening at this port
               hi.setReachable(false);
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
  +            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
               // e.printStackTrace();
               errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
           }
           catch (SocketException e)
           {
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
  +            //System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
               errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
   
           }
  @@ -464,7 +474,7 @@
               // IP Address not to be determined
               hi.setReachable(false);
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
  +            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
               // e.printStackTrace();
               errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
   
  @@ -472,7 +482,7 @@
           catch (IOException e)
           {
               taskState.setState(FT_EXCEPTION);
  -            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
  +            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
               // e.printStackTrace();
               errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());
   
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message