lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util SimpleStringTokenizer.java StoreLogFile.java
Date Tue, 22 Oct 2002 15:40:48 GMT
cmarschner    2002/10/22 08:40:48

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/net
                        HostManager.java
  Added:       contributions/webcrawler-LARM/src/de/lanlab/larm/net
                        HostResolver.java
               contributions/webcrawler-LARM/src/de/lanlab/larm/util
                        SimpleStringTokenizer.java StoreLogFile.java
  Log:
  
  
  Revision  Changes    Path
  1.2       +61 -27    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java
  
  Index: HostManager.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- HostManager.java	17 Jun 2002 14:00:13 -0000	1.1
  +++ HostManager.java	22 Oct 2002 15:40:48 -0000	1.2
  @@ -55,6 +55,11 @@
   package de.lanlab.larm.net;
   
   import java.util.HashMap;
  +import java.util.*;
  +import org.apache.oro.text.perl.*;
  +import org.apache.oro.text.regex.*;
  +import org.apache.oro.text.*;
  +import org.apache.oro.util.*;
   
   /**
    * Description of the Class
  @@ -67,8 +72,12 @@
   {
       HashMap hosts;
       static int hostCount = 0;
  +    HostResolver resolver;
   
   
  +
  +//    ArrayList rewriteRules = new ArrayList();
  +
       /**
        * Constructor for the HostInfo object
        *
  @@ -79,6 +88,20 @@
           hosts = new HashMap(initialCapacity);
       }
   
  +    public void setHostResolver(HostResolver resolver)
  +    {
  +        this.resolver = resolver;
  +    }
  +
  +    /**
  +     * returns the hostResolver
  +     * @return
  +     */
  +    public HostResolver getHostResolver()
  +    {
  +        return this.resolver;
  +    }
  +
   
       /**
        * Description of the Method
  @@ -88,7 +111,20 @@
        */
       public HostInfo put(String hostName)
       {
  -        if (!hosts.containsKey(hostName))
  +        if(resolver != null)
  +        {
  +            return putResolved(hostName, resolver.resolveHost(hostName));
  +        }
  +        else
  +        {
  +            return putResolved(hostName, hostName);
  +        }
  +    }
  +
  +
  +    public HostInfo putResolved(String hostName, String resolvedHostName)
  +    {
  +        if (!hosts.containsKey(resolvedHostName))
           {
               int hostID;
               synchronized (this)
  @@ -96,44 +132,43 @@
                   hostID = hostCount++;
               }
               HostInfo hi = new HostInfo(hostName,hostID);
  -            hosts.put(hostName, hi);
  +            hosts.put(resolvedHostName, hi);
               //System.out.println("hostManager: + " + hostName);
  -            if(!hostName.equals(hostName.toLowerCase()))
  -            {
  -                try
  -                {
  -                    throw new Exception();
  -                }
  -                catch(Exception e)
  -                {
  -                    e.printStackTrace();
  -                }
  -            }
  +//            if(!hostName.equals(hostName.toLowerCase()))
  +//            {
  +//                try
  +//                {
  +//                    throw new Exception();
  +//                }
  +//                catch(Exception e)
  +//                {
  +//                    e.printStackTrace();
  +//                }
  +//            }
               return hi;
           }
           return (HostInfo)hosts.get(hostName);
  -        /*else
  -        {
  -            hostID = hosts.get()
  -        }
  -        // assert hostID != -1;
  -        return hostID;*/
  -
       }
   
   
  +    public HostInfo getHostInfo(String hostName)
  +    {
  +        return getHostInfoNormalized(hostName, resolver.resolveHost(hostName));
  +    }
  +
       /**
        * Gets the hostID attribute of the HostInfo object
        *
        * @param hostName  Description of the Parameter
        * @return          The hostID value
        */
  -    public HostInfo getHostInfo(String hostName)
  +    public HostInfo getHostInfoNormalized(String hostName, String normalizedHostName)
       {
  -        HostInfo hi = (HostInfo)hosts.get(hostName);
  +        HostInfo hi = (HostInfo)hosts.get(normalizedHostName);
           if(hi == null)
           {
  -            return put(hostName);
  +//            System.out.println("new host: " + normalizedHostName);
  +            return putResolved(hostName, normalizedHostName);
           }
           return hi;
       }
  @@ -145,9 +180,8 @@
   
       public HostInfo addSynonym(String hostName, String synonym)
       {
  -        HostInfo info = getHostInfo(hostName);
  -        hosts.put(synonym, info);
  -        return info;
  +        resolver.addSynonym(hostName, synonym);
  +        return getHostInfo(hostName);
       }
   
   
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostResolver.java
  
  Index: HostResolver.java
  ===================================================================
  package de.lanlab.larm.net;
  
  import java.util.*;
  import xxl.collections.*;
  import java.io.*;
  import org.apache.commons.beanutils.*;
  import java.lang.reflect.*;
  import org.apache.commons.logging.*;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  
  
  
  //class LRUCache
  //{
  //    HashMap cache = null;
  //    LinkedList order = null;
  //    int max;
  //
  //    public LRUCache(int max)
  //    {
  //
  //        this.max = max;
  //        cache = new HashMap((int)(max/0.6));
  //        order = new LinkedList();
  //    }
  //
  //    public Object get(Object key)
  //    {
  //        return cache.get(key);
  //    }
  //
  //
  //
  //    public void put(Object key, Object value)
  //    {
  //        if(!cache.containsKey(key))
  //        {
  //           if(order.size() > max)
  //           {
  //               cache.remove(order.removeLast());
  //           }
  //        }
  //        else
  //        {
  //            //assert order.contains(key);
  //            order.remove(key);
  //            // quite expensive, probably need a hashed list
  //            // or something even simpler
  //        }
  //        order.addFirst(key);
  //        cache.put(key, value);
  //    }
  //}
  
  /**
   * Uses @link{#resolveHost()} which transforms a host name according to the rules
   * Rules are (and executed in this order)
   * <ul>
   * <li>if host starts with (startsWith), replace this part with (replacement)
   * <li>if host ends with (endsWith), replace it with (replacement)
   * <li>if host is (synonym), replace it with (replacement)
   * </ul>
   * the resolver can be configured through a property file, which is loaded by an
   * Apache BeanUtils property loader.<p>
   * Actually the resolver doesn't do any network calls, so this class can be used
   * with any string, if you really need to
   * @author Clemens Marschner
   * @version 1.0
   */
  public class HostResolver
  {
  
      HashMap synonym;
      public HostResolver()
      {
          synonym = new HashMap();
      }
  
      /**
       * convenience method that loads the config from a properties file
       * @param fileName a property file
       * @throws IOException thrown if fileName is wrong or something went wrong while reading
       * @throws InvocationTargetException thrown by java.util.Properties
       * @throws IllegalAccessException thrown by java.util.Properties
       */
      public void initFromFile(String fileName) throws IOException, InvocationTargetException,
IllegalAccessException
      {
          InputStream in = new FileInputStream(fileName);
          Properties p = new Properties();
          p.load(in);
          in.close();
          initFromProperties(p);
      }
  
      /**
       * populates the synonym, startsWith and endsWith properties with a BeanUtils.populate()
       * @param props
       * @throws InvocationTargetException
       * @throws IllegalAccessException
       */
      public void initFromProperties(Properties props) throws InvocationTargetException, IllegalAccessException
      {
          BeanUtils.populate(this, props);
      }
  
      ArrayList startsWithArray = new ArrayList();
      int startsWithSize = 0;
      ArrayList endsWithArray = new ArrayList();
      int endsWithSize = 0;
  
      public String getStartsWith(String name) throws IllegalAccessException
      {
          throw new IllegalAccessException("brrffz");
      }
  
      public void setStartsWith(String name, String rep)
      {
          addHostStartsWithReplace(name.replace(',','.'), rep.replace(',','.'));
      }
      public String getEndsWith(String name) throws IllegalAccessException
      {
          throw new IllegalAccessException("brrffz");
      }
      public void setEndsWith(String name, String rep)
      {
          this.addHostEndsWithReplace(name.replace(',','.'), rep.replace(',','.'));
      }
  
      public void setSynonym(String name, String syn)
      {
          addSynonym(name.replace(',','.'), syn.replace(',','.'));
      }
      public String getSynonym(String name) throws IllegalAccessException
      {
          throw new IllegalAccessException("brrffz");
      }
      public void addSynonym(String name, String syn)
      {
          System.out.println("adding synonym " + name + " -> " + syn);
          synonym.put(name, syn);
      }
  
      /**
       * transforms a host name if a rule is found
       * @param hostName
       * @return probably changed host name
       */
      public String resolveHost(String hostName)
      {
          if(hostName == null)
          {
              return null;
          }
          for(int i=0; i<startsWithSize; i++)
          {
              String[] test = (String[])startsWithArray.get(i);
              if(hostName.startsWith(test[0]))
              {
                  hostName = test[1] + hostName.substring(test[0].length());
                  break;
              }
          }
          for(int i=0; i<endsWithSize; i++)
          {
              String[] test = (String[])endsWithArray.get(i);
              if(hostName.endsWith(test[0]))
              {
                  hostName =  hostName.substring(0, hostName.length() - test[0].length())
+ test[1];
                  break;
              }
          }
          String syn = (String)synonym.get(hostName);
          return syn != null ? syn : hostName;
      }
  
      public void addHostStartsWithReplace(String startsWith, String replace)
      {
          System.out.println("adding sw replace " + startsWith + " -> " + replace);
          startsWithArray.add(new String[] { startsWith, replace });
          startsWithSize++;
      }
  
      public void addHostEndsWithReplace(String endsWith, String replace)
      {
          System.out.println("adding ew replace " + endsWith + " -> " + replace);
          endsWithArray.add(new String[] { endsWith, replace });
          endsWithSize++;
      }
  
  //    /** The pattern cache to compile and store patterns */
  //    private PatternCache __patternCache;
  //    /** The hashtable to cache higher-level expressions */
  //    private Cache __expressionCache;
  //    /** The pattern matcher to perform matching operations. */
  //    private Perl5Matcher __matcher = new Perl5Matcher();
  //
  //    public void addReplaceRegEx(String findRegEx, String replaceRegEx, boolean greedy)
  //    {
  //        int compileOptions    = Perl5Compiler.CASE_INSENSITIVE_MASK;
  //        int numSubstitutions = 1;
  //        if(greedy)
  //        {
  //            numSubstitutions = Util.SUBSTITUTE_ALL;
  //        }
  //
  //        Pattern compiledPattern = __patternCache.getPattern(findRegEx, compileOptions);
  //        Perl5Substitution substitution = new Perl5Substitution(replaceRegEx, numInterpolations);
  //        ParsedSubstitutionEntry entry = new ParsedSubstitutionEntry(compiledPattern, substitution,
 numSubstitutions);
  //        __expressionCache.addElement(expression, entry);
  //
  //        result = Util.substitute(__matcher, compiledPattern, substitution,
  //                     input, numSubstitutions);
  //
  //        __lastMatch = __matcher.getMatch();
  //
  //        return result;
  //    }
  
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleStringTokenizer.java
  
  Index: SimpleStringTokenizer.java
  ===================================================================
  package de.lanlab.larm.util;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  
  /**
   * Splits a string at every occurrence of <b>one</b> delimiter character.
   * Unlike Sun's StringTokenizer, two delimiters in a row yield an empty
   * token instead of being collapsed.
   *
   * @author    Clemens Marschner
   * @created   March 24, 2002
   */
  public class SimpleStringTokenizer
  {

      /** the string being tokenized */
      String string;

      /** index at which the next token starts */
      int currPos;
      /** index of the last character of the string (-1 for an empty string) */
      int maxPos;
      /** the delimiter character */
      char delim;


      /**
       * Creates a tokenizer positioned at the start of the given string.
       *
       * @param string  the string to be tokenized
       * @param delim   the delimiter that splits the string
       */
      public SimpleStringTokenizer(String string, char delim)
      {
          setString(string);
          setDelim(delim);
      }


      /**
       * Changes the delimiter; the current position is left untouched.
       *
       * @param delim  The new delim value
       */
      public void setDelim(char delim)
      {
          this.delim = delim;
      }


      /**
       * Installs a new string and rewinds the tokenizer, so one tokenizer
       * object can be reused for many strings.
       *
       * @param string  string to be tokenized
       */
      public void setString(String string)
      {
          this.string = string;
          reset();

          final int length = string.length();
          maxPos = length - 1;
      }


      /**
       * Rewinds the tokenizer to the start of the current string.
       */
      public void reset()
      {
          currPos = 0;
      }


      /**
       * Tells whether the current position is still inside the string.
       *
       * @return   false if the end is reached.
       */
      public boolean hasMore()
      {
          return !(currPos > maxPos);
      }


      /**
       * Returns the characters from the current position up to (excluding)
       * the next delimiter, or up to the end of the string when no delimiter
       * follows, and moves the position just behind that delimiter. Returns
       * an empty string if the end is reached.
       *
       * @return   the next token, possibly empty
       * @see      java.util.StringTokenizer#nextToken
       */
      public String nextToken()
      {
          final int delimPos = string.indexOf(delim, currPos);
          // without a further delimiter the token runs to the end of the string
          final int tokenEnd = (delimPos == -1) ? maxPos + 1 : delimPos;
          final String token = (tokenEnd > currPos) ? string.substring(currPos, tokenEnd) : "";
          currPos = tokenEnd + 1;
          return token;
      }
  }
  
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/StoreLogFile.java
  
  Index: StoreLogFile.java
  ===================================================================
  package de.lanlab.larm.util;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.*;
  import java.util.*;
  import de.lanlab.larm.parser.*;
  import java.net.*;
  import de.lanlab.larm.fetcher.*;
  import de.lanlab.larm.net.*;
  
  /**
   * Utility class for accessing page files through the store.log file.
   * Works like an iterator
   */
  public class StoreLogFile implements Iterator
  {
  
      /**
       * Not supported: this method always throws.
       *
       * @throws UnsupportedOperationException always
       */
      public void remove()
      {
          throw new UnsupportedOperationException();
      }
  
  
      /**
       * One parsed line of store.log together with access to the page
       * contents it points to.
       * <p>
       * The line is split at tab characters and read in this order: a first
       * column that is ignored, url, isFrame, a column that is ignored
       * (anchor), resultCode, mimeType, size, title and - only if size is
       * greater than 0 - pageFileNo and pageFileOffset.
       *
       * @author Clemens Marschner
       * @version 1.0
       */
      public class PageFileEntry
      {
          String url;
          int pageFileNo;
          int resultCode;
          String mimeType;
          int size;
          String title;
          int pageFileOffset;
          File pageFileDirectory;
          /** true only if pageFileNo and pageFileOffset were parsed successfully */
          boolean hasPageFileEntry;
          int isFrame;

          /**
           * Stream over the <code>size</code> bytes that start at
           * <code>pageFileOffset</code> in the file
           * "pagefile_" + pageFileNo + ".pfl" inside pageFileDirectory.
           * Reading never goes past those <code>size</code> bytes.
           */
          class PageFileInputStream extends InputStream
          {
              InputStream pageFileIS;
              /** number of bytes of this entry consumed so far */
              long offset;

              /**
               * Opens the page file and positions it at the start of the entry.
               *
               * @throws IOException if the file cannot be opened or positioned
               */
              public PageFileInputStream() throws IOException
              {
                  pageFileIS = new FileInputStream(new File(pageFileDirectory, "pagefile_" + pageFileNo + ".pfl"));
                  offset = 0;
                  boolean positioned = false;
                  try
                  {
                      // skip() is allowed to skip fewer bytes than requested,
                      // so repeat until the entry start is reached
                      long remaining = pageFileOffset;
                      while(remaining > 0)
                      {
                          long skipped = pageFileIS.skip(remaining);
                          if(skipped <= 0)
                          {
                              break; // no further progress possible
                          }
                          remaining -= skipped;
                      }
                      positioned = true;
                  }
                  finally
                  {
                      if(!positioned)
                      {
                          // do not leak the file handle when positioning fails
                          pageFileIS.close();
                      }
                  }
              }

              /**
               * @return the smaller of what the underlying file reports as
               *         available and what is left of this entry
               */
              public int available() throws IOException
              {
                  return Math.min(pageFileIS.available(), (int)(size - offset));
              }

              public void close() throws IOException
              {
                  pageFileIS.close();
              }

              /**
               * Marking is not supported.
               *
               * @throws UnsupportedOperationException always
               */
              public void mark(int readLimit)
              {
                  throw new UnsupportedOperationException();
              }

              public boolean markSupported()
              {
                  return false;
              }

              /**
               * @return the next byte of the entry, or -1 once <code>size</code>
               *         bytes were consumed or the file ended
               */
              public int read() throws IOException
              {
                  if(offset >= size)
                  {
                      return -1;
                  }
                  int c = pageFileIS.read();
                  if(c != -1)
                  {
                      offset ++;
                  }
                  return c;
              }

              /**
               * Same as <code>read(b, 0, b.length)</code>.
               */
              public int read(byte[] b) throws IOException
              {
                  return read(b, 0, b.length);
              }

              /**
               * Reads at most maxLen bytes, but never beyond the end of the entry.
               *
               * @param b      destination buffer
               * @param off    index in b where the first byte is stored
               * @param maxLen maximum number of bytes to read
               * @return number of bytes read, 0 if maxLen is 0, or -1 if the
               *         entry is exhausted or the file ended
               */
              public int read(byte[] b, int off, int maxLen) throws IOException
              {
                  if(maxLen == 0)
                  {
                      return 0; // a zero-length request is not an end-of-stream
                  }
                  int len = (int)Math.min((long)maxLen, size - offset);
                  if(len > 0)
                  {
                      // request only the clamped length; asking for maxLen here
                      // would read into whatever follows the entry in the file
                      len = pageFileIS.read(b, off, len);
                      if(len != -1)
                      {
                          offset += len;
                      }
                      return len;
                  }
                  return -1;
              }

              /**
               * Skips at most n bytes, but never beyond the end of the entry.
               *
               * @return number of bytes actually skipped; 0 if n is not positive
               *         or the entry is exhausted
               */
              public long skip(long n) throws IOException
              {
                  n = Math.min(n, size-offset);
                  if(n <= 0)
                  {
                      // never hand a negative count to the underlying stream
                      return 0;
                  }
                  n = pageFileIS.skip(n);
                  if(n > 0)
                  {
                      offset+=n;
                  }
                  return n;
              }



          }

          /**
           * Parses one line of store.log.
           * <p>
           * If one of the numeric columns cannot be parsed, parsing stops
           * silently: the fields read so far keep their values and
           * hasPageFileEntry stays false.
           *
           * @param storeLogLine      a tab separated line of store.log
           * @param pageFileDirectory directory that contains the page files
           */
          public PageFileEntry(String storeLogLine, File pageFileDirectory)
          {
              // name of the column being parsed; only used by the debug output below
              String column=null;
              SimpleStringTokenizer t = new SimpleStringTokenizer(storeLogLine, '\t');
              try
              {

                  hasPageFileEntry = false;
                  t.nextToken(); // first column is not used
                  url = t.nextToken();
                  column = "isFrame";
                  isFrame = Integer.parseInt(t.nextToken());
                  t.nextToken(); // anchor
                  column = "resultCode";
                  resultCode = Integer.parseInt(t.nextToken());
                  mimeType = t.nextToken();
                  column = "size";
                  size = Integer.parseInt(t.nextToken());
                  title = t.nextToken();
                  if(size > 0)
                  {
                      column = "pageFileNo";
                      pageFileNo = Integer.parseInt(t.nextToken());
                      column = "pageFileOffset";
                      pageFileOffset = Integer.parseInt(t.nextToken());
                      this.pageFileDirectory = pageFileDirectory;
                      hasPageFileEntry = true;
                  }
              }
              catch(NumberFormatException e) // possibly tab characters in title. ignore
              {
                  //System.out.println(e + " at " + url + " in column " + column);
              }
          }

          /**
           * @return a stream over the page contents, or null if this entry
           *         has no page file entry
           * @throws IOException if the page file cannot be opened
           */
          public InputStream getInputStream()  throws IOException
          {
              if(hasPageFileEntry)
              {
                  return new PageFileInputStream();
              }
              else return null;
          }

      }
  
      BufferedReader reader;
      boolean isOpen = false;
      File storeLog;
  
      /**
       *
       * @param storeLog location of store.log from LogStorage. pagefile_xy.pfl
       * must be in the same directory
       * @throws IOException
       */
      public StoreLogFile(File storeLog) throws IOException
      {
          this.storeLog = storeLog;
           reader = new BufferedReader(new FileReader(storeLog));
           isOpen = true; // unless exception
  
      }
  
      /**
       * Tells whether another line can be read from store.log, without
       * consuming anything.
       *
       * @return true if at least one more character (and therefore one more
       *         line) is available
       * @throws RuntimeException if reading the log fails; the IOException
       *         is attached as cause
       */
      public boolean hasNext()
      {
          try
          {
              // Peek a single character instead of a whole line: readLine()
              // returns null exactly when no character is left, and a mark of
              // one character cannot be invalidated by a long line the way
              // mark(1000) followed by readLine() could.
              reader.mark(1);
              if(reader.read() == -1)
              {
                  return false;
              }
              reader.reset();
              return true;
          }
          catch(IOException e)
          {
              throw new RuntimeException("IOException occured", e);
          }
      }
  
      /**
       * Reads the next line of store.log and parses it.
       *
       * @return a StoreLogFile.PageFileEntry for the line just read
       * @throws NoSuchElementException if the end of store.log is reached
       * @throws RuntimeException if reading the log fails; the IOException
       *         is attached as cause
       */
      public Object next()
      {
          String line;
          try
          {
              line = reader.readLine();
          }
          catch(IOException e)
          {
              throw new RuntimeException("IOException occured", e);
          }
          if(line == null)
          {
              // end of file: report it the way Iterator callers expect
              // instead of failing inside the line parser
              throw new NoSuchElementException();
          }
          return new PageFileEntry(line, storeLog.getParentFile());
      }
  
  
  
  
  //    static SimpleLogger log;
  //    static PageFileEntry entry;
  //    static ArrayList foundURLs;
  //    static URL base;
  //    static URL contextUrl;
  //
  //    static void test1(StoreLogFile store) throws IOException
  //    {
  //        while(store.hasNext())
  //        {
  //            PageFileEntry entry = store.next();
  //            if(entry.mimeType.equals("text/plain") && entry.hasPageFileEntry)
  //            {
  //                BufferedReader r = new BufferedReader(new InputStreamReader(entry.getInputStream()));
  //                String l;
  //                while((l = r.readLine()) != null)
  //                {
  //                    System.out.println(entry.url + " >> " + l);
  //                }
  //                r.close();
  //            }
  //            //System.out.println(entry.title);
  //        }
  //    }
  //    static void test2(StoreLogFile store) throws Exception
  //    {
  //        MessageHandler msgH = new MessageHandler();
  //        log = new SimpleLogger("errors.log");
  //        msgH.addListener(new URLVisitedFilter(log, 100000));
  //        final de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(1000);
  //        hm.setHostResolver(new HostResolver());
  //
  //        while(store.hasNext())
  //        {
  //            entry = store.next();
  //            foundURLs = new ArrayList();
  //            if(entry.mimeType.startsWith("text/html") && entry.hasPageFileEntry)
  //            {
  //                Tokenizer t = new Tokenizer();
  //                base = new URL(entry.url);
  //                contextUrl = new URL(entry.url);
  //
  //                t.setLinkHandler(new LinkHandler()
  //                {
  //
  //                    public void handleLink(String link, String anchor, boolean isFrame)
  //                    {
  //                        try
  //                        {
  //                            // cut out Ref part
  //
  //
  //                            int refPart = link.indexOf("#");
  //                            //System.out.println(link);
  //                            if (refPart == 0)
  //                            {
  //                                return;
  //                            }
  //                            else if (refPart > 0)
  //                            {
  //                                link = link.substring(0, refPart);
  //                            }
  //
  //                            URL url = null;
  //                            if (link.startsWith("http:"))
  //                            {
  //                                // distinguish between absolute and relative URLs
  //
  //                                url = new URL(link);
  //                            }
  //                            else
  //                            {
  //                                // relative url
  //                                url = new URL(base, link);
  //                            }
  //
  //                            URLMessage urlMessage =  new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hm.getHostResolver());
  //
  //                            String urlString = urlMessage.getURLString();
  //
  //                            foundURLs.add(urlMessage);
  //                            //messageHandler.putMessage(new actURLMessage(url)); // put them in the very end
  //                        }
  //                        catch (MalformedURLException e)
  //                        {
  //                            //log.log("malformed url: base:" + base + " -+- link:" + link);
  //                            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
  //                        }
  //                        catch (Exception e)
  //                        {
  //                            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
  //                            // e.printStackTrace();
  //                        }
  //
  //                    }
  //
  //
  //                    /**
  //                     * called when a BASE tag was found
  //                     *
  //                     * @param base  the HREF attribute
  //                     */
  //                    public void handleBase(String baseString)
  //                    {
  //                        try
  //                        {
  //                            base = new URL(baseString);
  //                        }
  //                        catch (MalformedURLException e)
  //                        {
  //                            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage() + " while converting '" + base + "' to URL in document " + contextUrl);
  //                        }
  //                    }
  //
  //                    public void handleTitle(String value)
  //                    {}
  //
  //
  //                });
  //                t.parse(new BufferedReader(new InputStreamReader(entry.getInputStream())));
  //                msgH.putMessages(foundURLs);
  //            }
  //
  //        }
  //
  //    }
  //
  //    public static void main(String[] args) throws Exception
  //    {
  //        StoreLogFile store = new StoreLogFile(new File("c:/java/jakarta-lucene-sandbox/contributions/webcrawler-LARM/logs/store.log"));
  //        test2(store);
  //    }
  
  }
  
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message