lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher RobotExclusionFilter.java
Date Tue, 22 Oct 2002 15:15:07 GMT
cmarschner    2002/10/22 08:15:07

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        RobotExclusionFilter.java
  Log:
  improved logging
  
  Revision  Changes    Path
  1.4       +34 -20    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java
  
  Index: RobotExclusionFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- RobotExclusionFilter.java	17 Jun 2002 13:59:28 -0000	1.3
  +++ RobotExclusionFilter.java	22 Oct 2002 15:15:07 -0000	1.4
  @@ -121,9 +121,9 @@
        */
       public RobotExclusionFilter(HostManager hm)
       {
  -        log = new SimpleLogger("RobotExclusionFilter");
  +        log = new SimpleLogger("RobotExclusionFilter", true);
           hostManager = hm;
  -        rePool = new ThreadPool(2, new REFThreadFactory());
  +        rePool = new ThreadPool(5, new REFThreadFactory());
           rePool.init();
           log.setFlushAtOnce(true);
           log.log("refilter: initialized");
  @@ -164,19 +164,21 @@
               // assert message instanceof URLMessage;
               URLMessage urlMsg = ((URLMessage) message);
               URL url = urlMsg.getUrl();
  +//            String urlString = urlMsg.getNormalizedURLString();
  +//            URL nUrl = new URL(urlString);
               //assert url != null;
  -            HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
  -            if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
  +            HostInfo h = hostManager.getHostInfo(url.getHost());
  +            synchronized (h)
               {
  -                log.logThreadSafe("handleRequest: starting to get robots.txt");
  -                // probably this results in Race Conditions here
  +                if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
  +                {
  +                    log.logThreadSafe("handleRequest: starting to get robots.txt");
  +                    // probably this results in Race Conditions here
   
  -                rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
  -                h.setLoadingRobotsTxt(true);
  -            }
  +                    rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
  +                    h.setLoadingRobotsTxt(true);
  +                }
   
  -            synchronized (h)
  -            {
                   // isLoading...() and queuedRequest.insert() must be atomic
                   if (h.isLoadingRobotsTxt())
                   {
  @@ -271,8 +273,16 @@
            */
           public void run(ServerThread thread)
           {
  -            // assert hostInfo != null;
               String threadName = Thread.currentThread().getName();
  +            synchronized(hostInfo)
  +            {
  +                if(hostInfo.isRobotTxtChecked())
  +                {
  +                    log.logThreadSafe("task " + threadName + ": already loaded " + hostInfo.getHostName());
  +                    return;         // may happen 'cause check is not synchronized
  +                }
  +            }
  +            // assert hostInfo != null;
   
               log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
               //hostInfo.setLoadingRobotsTxt(true);
  @@ -290,6 +300,7 @@
                   if (res.getStatusCode() != 200)
                   {
                       errorOccured = true;
  +                    log.log("task " + threadName + ": return code was " + res.getStatusCode());
                   }
                   else
                   {
  @@ -309,26 +320,26 @@
               catch (java.net.UnknownHostException e)
               {
                   hostInfo.setReachable(false);
  -                log.logThreadSafe("task " + threadName + ": unknown host. setting to unreachable");
  +                log.logThreadSafe("task " + threadName + ": unknown host '" + hostInfo.getHostName() + "'. setting to unreachable");
                   errorOccured = true;
               }
               catch (java.net.NoRouteToHostException e)
               {
                   hostInfo.setReachable(false);
  -                log.logThreadSafe("task " + threadName + ": no route to. setting to unreachable");
  +                log.logThreadSafe("task " + threadName + ": no route to '"+hostInfo.getHostName()+"'. setting to unreachable");
                   errorOccured = true;
               }
               catch (java.net.ConnectException e)
               {
                   hostInfo.setReachable(false);
  -                log.logThreadSafe("task " + threadName + ": connect exception. setting to unreachable");
  +                log.logThreadSafe("task " + threadName + ": connect exception while connecting to '"+hostInfo.getHostName()+"'. setting to unreachable");
                   errorOccured = true;
               }
               catch (java.io.InterruptedIOException e)
               {
                   // time out. fatal in this case
                   hostInfo.setReachable(false);
  -                log.logThreadSafe("task " + threadName + ": time out. setting to unreachable");
  +                log.logThreadSafe("task " + threadName + ": time out while connecting to '" +hostInfo.getHostName() + "'. setting to unreachable");
                   errorOccured = true;
               }
   
  @@ -343,19 +354,20 @@
               {
                   if (errorOccured)
                   {
  +                    log.logThreadSafe("task " + threadName + ": error occured. putback...");
                       synchronized (hostInfo)
                       {
                           hostInfo.setRobotsChecked(true, null);
                           // crawl everything
                           hostInfo.setLoadingRobotsTxt(false);
  -                        log.logThreadSafe("task " + threadName + ": error occured");
                           log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
  -                        hostInfo.setLoadingRobotsTxt(false);
  +                        //hostInfo.setLoadingRobotsTxt(false);
                           putBackURLs();
                       }
                   }
                   else
                   {
  +                    log.logThreadSafe("task " + threadName + ": finished. putback...");
                       synchronized (hostInfo)
                       {
                           hostInfo.setRobotsChecked(true, disallows);
  @@ -374,11 +386,13 @@
            */
           private void putBackURLs()
           {
  +
  +            int qSize = hostInfo.getQueueSize();
               while (hostInfo.getQueueSize() > 0)
               {
                   messageHandler.putMessage((Message) hostInfo.removeFromQueue());
               }
  -            log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
  +            log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished. put " + qSize + " URLs back");
               hostInfo.removeQueue();
           }
   
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message