lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher URLScopeFilter.java
Date Tue, 22 Oct 2002 15:21:00 GMT
cmarschner    2002/10/22 08:21:00

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        URLScopeFilter.java
  Log:
  URLScopeFilter now matches against the normalized URL string; added logging of out-of-scope messages via a SimpleLogger passed to the constructor
  
  Revision  Changes    Path
  1.3       +9 -4      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java
  
  Index: URLScopeFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- URLScopeFilter.java	22 May 2002 23:09:17 -0000	1.2
  +++ URLScopeFilter.java	22 Oct 2002 15:21:00 -0000	1.3
  @@ -57,6 +57,7 @@
   import org.apache.oro.text.regex.Perl5Matcher;
   import org.apache.oro.text.regex.Perl5Compiler;
   import org.apache.oro.text.regex.Pattern;
  +import de.lanlab.larm.util.*;
   
   /**
    * filter class. Tries to match a regular expression with an incoming URL
  @@ -77,11 +78,13 @@
       private Pattern pattern;
       private Perl5Matcher matcher;
       private Perl5Compiler compiler;
  +    SimpleLogger log;
   
  -    public URLScopeFilter()
  +    public URLScopeFilter(SimpleLogger log)
       {
               matcher = new Perl5Matcher();
               compiler = new Perl5Compiler();
  +            this.log = log;
       }
   
       public String getRexString()
  @@ -108,7 +111,7 @@
       {
           if(message instanceof URLMessage)
           {
  -            String urlString = ((URLMessage)message).toString();
  +            String urlString = ((URLMessage)message).getNormalizedURLString();
               int length = urlString.length();
               char buffer[] = new char[length];
               urlString.getChars(0,length,buffer,0);
  @@ -117,8 +120,10 @@
               boolean match = matcher.matches(buffer, pattern);
               if(!match)
               {
  -                //System.out.println("not in Scope: " + urlString);
  +                //log.log("URLScopeFilter: not in scope: " + urlString);
  +                log.log(message.toString());
                   filtered++;
  +
                   return null;
               }
           }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message