lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net HostInfo.java HostManager.java URLNormalizer.java
Date Mon, 17 Jun 2002 14:00:14 GMT
cmarschner    2002/06/17 07:00:14

  Added:       contributions/webcrawler-LARM/src/de/lanlab/larm/net
                        HostInfo.java HostManager.java URLNormalizer.java
  Log:
  moved HostInfo/HostManager to larm.net package; added URLNormalizer
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java
  
  Index: HostInfo.java
  ===================================================================
  /*
   *  ====================================================================
   *  The Apache Software License, Version 1.1
   *
   *  Copyright (c) 2001 The Apache Software Foundation.  All rights
   *  reserved.
   *
   *  Redistribution and use in source and binary forms, with or without
   *  modification, are permitted provided that the following conditions
   *  are met:
   *
   *  1. Redistributions of source code must retain the above copyright
   *  notice, this list of conditions and the following disclaimer.
   *
   *  2. Redistributions in binary form must reproduce the above copyright
   *  notice, this list of conditions and the following disclaimer in
   *  the documentation and/or other materials provided with the
   *  distribution.
   *
   *  3. The end-user documentation included with the redistribution,
   *  if any, must include the following acknowledgment:
   *  "This product includes software developed by the
   *  Apache Software Foundation (http://www.apache.org/)."
   *  Alternately, this acknowledgment may appear in the software itself,
   *  if and wherever such third-party acknowledgments normally appear.
   *
   *  4. The names "Apache" and "Apache Software Foundation" and
   *  "Apache Lucene" must not be used to endorse or promote products
   *  derived from this software without prior written permission. For
   *  written permission, please contact apache@apache.org.
   *
   *  5. Products derived from this software may not be called "Apache",
   *  "Apache Lucene", nor may "Apache" appear in their name, without
   *  prior written permission of the Apache Software Foundation.
   *
   *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   *  DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   *  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   *  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   *  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   *  SUCH DAMAGE.
   *  ====================================================================
   *
   *  This software consists of voluntary contributions made by many
   *  individuals on behalf of the Apache Software Foundation.  For more
   *  information on the Apache Software Foundation, please see
   *  <http://www.apache.org/>.
   */
  package de.lanlab.larm.net;
  
  import java.util.HashMap;
  import java.net.*;
  import de.lanlab.larm.util.CachingQueue;
  import de.lanlab.larm.util.Queue;
  import java.util.LinkedList;
  import de.lanlab.larm.fetcher.Message;
  
  /**
   * Per-host crawl state: an error budget, a reachability flag, the robots.txt
   * status together with the disallowed path prefixes, and an optional queue of
   * pending messages. This class is used by the HostManager.
   *
   * <p>{@link #isHealthy()} turns false once the error budget is used up or the
   * host has been flagged unreachable; how callers react to that is decided
   * outside this class.</p>
   *
   * <p>Thread safety: only {@link #isAllowed(String)} and
   * {@link #setRobotsChecked(boolean, String[])} synchronize on this object.
   * Every other member must be synchronized externally if it is shared.</p>
   *
   * @author    Clemens Marschner
   * @created   16 February 2002
   * @version   $Id: HostInfo.java,v 1.1 2002/06/17 14:00:13 cmarschner Exp $
   */
  public class HostInfo
  {
      /** shared empty prefix list, used whenever no exclusions are known */
      final static String[] emptyKeepOutDirectories = new String[0];

      /** numeric id handed in by the creator of this object */
      int id;

      /** remaining error budget: five strikes, and you're out */
      int healthyCount = 5;

      boolean isReachable = true;

      /** robot exclusion: set through setRobotsChecked() */
      boolean robotTxtChecked = false;

      /** robot exclusion: path prefixes that must not be fetched; never null */
      String[] disallows;

      /** robot exclusion: set through setLoadingRobotsTxt() */
      boolean isLoadingRobotsTxt = false;

      /** only non-null between setLoadingRobotsTxt(true) and removeQueue() */
      Queue queuedRequests = null;

      String hostName;


      /**
       * Creates the state object for one host. The host starts reachable, with
       * the full error budget, robots.txt unchecked and no disallowed prefixes.
       *
       * @param hostName  the host name reported by {@link #getHostName()}
       * @param id        the id reported by {@link #getId()}
       */
      public HostInfo(String hostName, int id)
      {
          this.id = id;
          this.disallows = HostInfo.emptyKeepOutDirectories;
          this.hostName = hostName;
      }


      /**
       * Gets the id attribute of the HostInfo object
       *
       * @return   The id value
       */
      public int getId()
      {
          return id;
      }


      /**
       * Gets the hostName attribute of the HostInfo object
       *
       * @return   The hostName value
       */
      public String getHostName()
      {
          return hostName;
      }


      /**
       * Drops the reference to the request queue. Afterwards the queue accessors
       * fail until setLoadingRobotsTxt(true) creates a new queue.
       */
      public void removeQueue()
      {
          queuedRequests = null;
      }


      /**
       * Appends a message to the request queue. No error checking is done when
       * the queue is null (a NullPointerException is the result).
       *
       * @param message  the message to enqueue
       */
      public void insertIntoQueue(Message message)
      {
          queuedRequests.insert(message);
      }


      /**
       * Gets the queueSize. No error checking is done when the queue is null
       *
       * @return   The queueSize value
       */
      public int getQueueSize()
      {
          return queuedRequests.size();
      }


      /**
       * Removes one entry from the queue and returns it; which end it is taken
       * from is decided by the Queue implementation. No error checking is done
       * when the queue is null
       *
       * @return   the removed entry, cast to Message
       */
      public Message removeFromQueue()
      {
          return (Message) queuedRequests.remove();
      }


      /**
       * is this host reachable and responding?
       *
       * @return   true while the error budget is positive and the host is
       *           flagged reachable
       */
      public boolean isHealthy()
      {
          return (healthyCount > 0) && isReachable;
      }


      /**
       * signals that the host returned with a bad request of whatever type;
       * uses up one unit of the error budget
       */
      public void badRequest()
      {
          healthyCount--;
      }


      /**
       * Sets the reachable attribute of the HostInfo object
       *
       * @param reachable  The new reachable value
       */
      public void setReachable(boolean reachable)
      {
          isReachable = reachable;
      }


      /**
       * Gets the reachable attribute of the HostInfo object
       *
       * @return   The reachable value
       */
      public boolean isReachable()
      {
          return isReachable;
      }


      /**
       * Gets the robotTxtChecked attribute of the HostInfo object
       *
       * @return   The robotTxtChecked value
       */
      public boolean isRobotTxtChecked()
      {
          return robotTxtChecked;
      }


      /**
       * must be synchronized externally
       *
       * @return   The loadingRobotsTxt value
       */
      public boolean isLoadingRobotsTxt()
      {
          return this.isLoadingRobotsTxt;
      }


      /**
       * Sets the loadingRobotsTxt attribute of the HostInfo object. Every call
       * with <code>true</code> installs a fresh, empty request queue and thereby
       * discards a queue that may already exist; a call with <code>false</code>
       * leaves the queue untouched.
       *
       * @param isLoading  The new loadingRobotsTxt value
       */
      public void setLoadingRobotsTxt(boolean isLoading)
      {
          this.isLoadingRobotsTxt = isLoading;
          if (isLoading)
          {
              // NOTE(review): the meaning of the second argument (100) is defined
              // by CachingQueue - presumably a block size; confirm there
              this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
          }
      }


      /**
       * Records the outcome of the robots.txt check. Holds the same lock as
       * {@link #isAllowed(String)} so that the prefix list is never replaced
       * while a lookup is running.
       *
       * @param isChecked  The new robotTxtChecked value
       * @param disallows  the disallowed path prefixes; <code>null</code> is
       *                   stored as the empty list. The array is not copied.
       */
      public synchronized void setRobotsChecked(boolean isChecked, String[] disallows)
      {
          this.robotTxtChecked = isChecked;
          this.disallows = (disallows != null) ? disallows : emptyKeepOutDirectories;
      }


      /**
       * Tells whether a path may be fetched from this host.
       *
       * @param path  the path to test; must not be null
       * @return      false if the path starts with one of the disallowed
       *              prefixes, true otherwise
       */
      public synchronized boolean isAllowed(String path)
      {
          // assume the prefix list is pretty short, so a linear scan is fine.
          // Read the field once: the loop bound and the elements then always
          // belong to the same array.
          final String[] prefixes = disallows;
          for (int i = 0; i < prefixes.length; i++)
          {
              if (path.startsWith(prefixes[i]))
              {
                  return false;
              }
          }
          return true;
      }

  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java
  
  Index: HostManager.java
  ===================================================================
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  package de.lanlab.larm.net;
  
  import java.util.HashMap;
  
  /**
   * Registry that maps host names to their {@link HostInfo} objects. Several
   * names (synonyms) may point to the same HostInfo.
   *
   * <p>All methods synchronize on the manager, so look-up-or-create in
   * {@link #put(String)} is atomic per instance. The host id counter is static,
   * shared by all managers, and guarded by the class lock.</p>
   *
   * @author    Administrator
   * @created   16 February 2002
   * @version $Id: HostManager.java,v 1.1 2002/06/17 14:00:13 cmarschner Exp $
   */
  public class HostManager
  {
      /** host name (String) -> HostInfo; synonyms are additional keys */
      HashMap hosts;

      /** next host id to hand out; shared by all HostManager instances */
      static int hostCount = 0;


      /**
       * Constructor for the HostManager object
       *
       * @param initialCapacity  initial capacity of the underlying hash map
       */
      public HostManager(int initialCapacity)
      {
          hosts = new HashMap(initialCapacity);
      }


      /**
       * Returns the HostInfo registered under the given name and creates it
       * first if the name is unknown. A new HostInfo receives the next free
       * host id.
       *
       * <p>Names are stored exactly as given. Registering a name that is not
       * all lower case is still carried out, but a stack trace is printed to
       * help locating the caller.</p>
       *
       * @param hostName  the name to look up or register
       * @return          the existing or newly created HostInfo
       */
      public synchronized HostInfo put(String hostName)
      {
          HostInfo known = (HostInfo) hosts.get(hostName);
          if (known != null)
          {
              return known;
          }

          int hostID;
          // the counter is static, so the instance lock alone does not protect it
          synchronized (HostManager.class)
          {
              hostID = hostCount++;
          }
          HostInfo created = new HostInfo(hostName, hostID);
          hosts.put(hostName, created);

          // fixed locale: the check must not depend on the platform default
          if (!hostName.equals(hostName.toLowerCase(java.util.Locale.ENGLISH)))
          {
              // diagnostic only: shows who passed in the unnormalized name
              new Exception("HostManager.put(): host name is not lower case: " + hostName)
                  .printStackTrace();
          }
          return created;
      }


      /**
       * Gets the HostInfo for a host name; an unknown name is registered via
       * {@link #put(String)}, so this method never returns null.
       *
       * @param hostName  the name to look up
       * @return          the HostInfo registered under that name
       */
      public synchronized HostInfo getHostInfo(String hostName)
      {
          HostInfo hi = (HostInfo) hosts.get(hostName);
          return (hi != null) ? hi : put(hostName);
      }


      /**
       * Gets the number of registered names. Synonyms are separate keys and
       * therefore count as well.
       *
       * @return   the number of keys in the host map
       */
      public synchronized int getSize()
      {
          return hosts.size();
      }


      /**
       * Registers a second name for a host. The host itself is created if it is
       * not known yet; an entry that already exists under the synonym is
       * replaced.
       *
       * @param hostName  the primary name of the host
       * @param synonym   the additional name that shall resolve to the same
       *                  HostInfo
       * @return          the HostInfo of the primary name
       */
      public synchronized HostInfo addSynonym(String hostName, String synonym)
      {
          HostInfo info = getHostInfo(hostName);
          hosts.put(synonym, info);
          return info;
      }

  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java
  
  Index: URLNormalizer.java
  ===================================================================
  package de.lanlab.larm.net;
  /*
   *  ====================================================================
   *  The Apache Software License, Version 1.1
   *
   *  Copyright (c) 2001 The Apache Software Foundation.  All rights
   *  reserved.
   *
   *  Redistribution and use in source and binary forms, with or without
   *  modification, are permitted provided that the following conditions
   *  are met:
   *
   *  1. Redistributions of source code must retain the above copyright
   *  notice, this list of conditions and the following disclaimer.
   *
   *  2. Redistributions in binary form must reproduce the above copyright
   *  notice, this list of conditions and the following disclaimer in
   *  the documentation and/or other materials provided with the
   *  distribution.
   *
   *  3. The end-user documentation included with the redistribution,
   *  if any, must include the following acknowledgment:
   *  "This product includes software developed by the
   *  Apache Software Foundation (http://www.apache.org/)."
   *  Alternately, this acknowledgment may appear in the software itself,
   *  if and wherever such third-party acknowledgments normally appear.
   *
   *  4. The names "Apache" and "Apache Software Foundation" and
   *  "Apache Lucene" must not be used to endorse or promote products
   *  derived from this software without prior written permission. For
   *  written permission, please contact apache@apache.org.
   *
   *  5. Products derived from this software may not be called "Apache",
   *  "Apache Lucene", nor may "Apache" appear in their name, without
   *  prior written permission of the Apache Software Foundation.
   *
   *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   *  DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   *  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   *  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   *  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   *  SUCH DAMAGE.
   *  ====================================================================
   *
   *  This software consists of voluntary contributions made by many
   *  individuals on behalf of the Apache Software Foundation.  For more
   *  information on the Apache Software Foundation, please see
   *  <http://www.apache.org/>.
   */
  import java.io.*;
  import java.net.*;
  
  
  /**
   * Description of the Class
   *
   * @author    Administrator
   * @created   14. Juni 2002
   */
  public class URLNormalizer
  {
      final static int NP_SLASH = 1;
      final static int NP_CHAR = 2;
      final static int NP_PERCENT = 3;
      final static int NP_POINT = 4;
      final static int NP_HEX = 5;
  
      /**
       * contains hex codes for characters in lowercase uses char arrays instead
       * of strings for faster processing
       */
      protected static char[][] charMap = {
              {'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0',
'4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'},
{'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%',
'0', 'F'},
              {'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1',
'4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'},
{'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%',
'1', 'F'},
              {'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%',
'2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2',
'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
              {'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%',
'3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3',
'F'},
              {'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'},
{'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
              {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'},
{'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'},
              {'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'},
{'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
              {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'},
{'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'},
              {'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8',
'4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'},
{'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%',
'8', 'F'},
              {'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9',
'4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'},
{'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%',
'9', 'F'},
              {'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A',
'4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'},
{'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%',
'A', 'F'},
              {'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B',
'4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'},
{'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%',
'B', 'F'},
              {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E',
'4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'},
{'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%',
'E', 'F'},
              {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F',
'4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'},
{'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%',
'D', 'F'},
              {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E',
'4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'},
{'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%',
'E', 'F'},
              {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F',
'4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'},
{'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%',
'F', 'F'},
              };
  
  
      /**
       * Description of the Method
       *
       * @param path             Description of the Parameter
       * @return                 Description of the Return Value
       * @exception IOException  Description of the Exception
       */
      protected static String normalizePath(String path)
          throws IOException
      {
          // rule 1: if the path is empty, return "/"
          if (path.length() == 0)
          {
              return "/";
          }
  
          // Finite State Machine to convert characters to lowercase, remove "//" and "/./"
          // and make sure that all characters are escaped in a uniform way, i.e.
          // {" ", "+", "%20"} -> "%20"
  
          StringBuffer w = new StringBuffer((int) (path.length() * 1.5));
  
          int status = NP_CHAR;
  
          int pos = 0;
          int length = path.length();
          char savedChar = '?';
          int hexChar = '?';
          int pathPos = -1;    // position of last "/"
          int questionPos = -1; // assert length >0
          boolean isInQuery = false;  // question mark reached?
  
          while (pos < length)
          {
              char c = path.charAt(pos++);
              try
              {
                  switch (status)
                  {
                      case NP_SLASH:
                          if (c == '/')
                          {
                              // ignore subsequent slashes
                          }
                          else if (c == '.')
                          {
                              status = NP_POINT;
                          }
                          else if (c == '%')
                          {
                              status = NP_PERCENT;
                          }
                          else
                          {
                              pos--;
                              status = NP_CHAR;
                          }
                          break;
                      case NP_POINT:
                          if (c == '/')
                          {
                              // ignore
                          }
                          else if (c == '.')
                          {
                              // ignore; this shouldn't happen
                          }
                          else
                          {
                              w.append('.');
                              pos--;
                              status = NP_SLASH;
                          }
                          break;
                      case NP_PERCENT:
                          if (c >= '0' && c <= '9')
                          {
                              hexChar = (c - '0') << 4;
                          }
                          else if (c >= 'a' && c <= 'f')
                          {
                              hexChar = (c - 'a' + 10) << 4;
                          }
                          else if (c >= 'A' && c <= 'F')
                          {
                              hexChar = (c - 'A' + 10) << 4;
                          }
                          else
                          {
                              w.append(charMap['%']);
                              w.append(charMap[c]);
                              break;
                          }
                          savedChar = c;
                          status = NP_HEX;
                          break;
                      case NP_HEX:
                          if (c >= '0' && c <= '9')
                          {
                              hexChar |= (c - '0');
                          }
                          else if (c >= 'a' && c <= 'f')
                          {
                              hexChar |= (c - 'a' + 10);
                          }
                          else if (c >= 'A' && c <= 'F')
                          {
                              hexChar |= (c - 'A' + 10);
                          }
                          else
                          {
                              w.append(charMap['%']);
                              w.append(charMap[savedChar]);
                              w.append(charMap[c]);
                              break;
                          }
                          w.append(charMap[hexChar]);
                          status = NP_CHAR;
                          break;
                      case NP_CHAR:
                          switch (c)
                          {
                              case '%':
                                  status = NP_PERCENT;
                                  break;
                              case '/':
                                  if(!isInQuery)
                                  {
                                      w.append(c);
                                      pathPos = w.length(); // points to the char. after "/"
                                      status = NP_SLASH;
                                  }
                                  else
                                  {
                                      w.append(charMap[c]);
                                  }
                                  break;
                              case '?':
                                  if(!isInQuery)
                                  {
                                      if(pathPos == -1)
                                      {
                                          w.append('/');
                                          pathPos = w.length();
                                      }
                                      questionPos = w.length(); // points to the char at "?"
                                      isInQuery = true;
                                  }
                                  else
                                  {
                                      w.append(charMap[c]);
                                      break;
                                  }
                              case '&':
                              case ';':
                              case '@':
                              //case ':':
                              case '=':
                                  w.append(c);
                                  break;
                              case '+':
                                  w.append("%20");
                                  break;
                              default:
                                  w.append(charMap[c]);
                                  break;
                          }
                  }
  
              }
              catch (ArrayIndexOutOfBoundsException e)
              {
                  // we encountered a unicode character >= 0x00ff
                  // write UTF-8 to distinguish it from other characters
                  // note that this does NOT lead to a pure UTF-8 URL since we
                  // write 0x80 <= c <= 0xff as one-byte strings
                  /*
                   *  if (ch <= 0x007f) {		// other ASCII
                   *  sbuf.append(hex[ch]);
                   *  } else
                   */
                  // note that we ignore the case that we receive "%" + unicode + c
                  // (status = NP_HEX + Exception when writing savedchar); in that case
                  // only the second character is written. we consider this to be very
                  // unlikely
  
                  // see http://www.w3.org/International/O-URL-code.html
                  if (c <= 0x07FF)
                  {
                      // non-ASCII <= 0x7FF
                      w.append(charMap[0xc0 | (c >> 6)]);
                      w.append(charMap[0x80 | (c & 0x3F)]);
                  }
                  else
                  {
                      // 0x7FF < c <= 0xFFFF
                      w.append(charMap[0xe0 | (c >> 12)]);
                      w.append(charMap[0x80 | ((c >> 6) & 0x3F)]);
                      w.append(charMap[0x80 | (c & 0x3F)]);
                  }
              }
          }
  
          // rule 3: delete index.* or default.*
  
          if(questionPos == -1) // no query
          {
              questionPos = w.length();
          }
          else
          {
              if(questionPos == w.length()-1)
              {
                  // empty query. assert questionPos > 0
                  w.deleteCharAt(questionPos);
              }
          }
          if(pathPos == -1) // no query
          {
              pathPos = 0;
          }
          if(questionPos > pathPos)
          {
              String file = w.substring(pathPos, questionPos);
              {
                  //System.out.println("file: " + file);
                  if(file.startsWith("index.") || file.startsWith("default."))
                  {
                      w.delete(pathPos, questionPos); // delete default page to avoid ambiguities
                  }
              }
          }
          return w.toString();
      }
  
  
      /**
       * Description of the Method
       *
       * @param host  Description of the Parameter
       * @return      Description of the Return Value
       */
      protected static String normalizeHost(HostManager hostManager, String host)
      {
          return hostManager.getHostInfo(host.toLowerCase()).getHostName();
      }
  
  /*
      HostManager hostManager;
  */
  
      /**
       * Constructor for the URLNormalizer object
       *
       * @param hostManager  Description of the Parameter
       */
     /* public URLNormalizer(HostManager hostManager)
      {
          this.hostManager = hostManager;
      }*/
  
  
      /**
       * Description of the Method
       *
       * @param u                          Description of the Parameter
       * @return                           Description of the Return Value
       * @exception IOException            Description of the Exception
       * @exception MalformedURLException  Description of the Exception
       */
      public static URL normalize(URL u, HostManager hostManager)
      {
          if (u.getProtocol().equals("http"))
          {
              try
              {
                  int port = u.getPort();
                  /*URL url =*/
                  return  new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()),
port == 80 ? -1 : port, normalizePath(u.getFile()));
                  /*if(!u.equals(url))
                  {
                      System.out.println(u.toExternalForm() + " -> " + url.toExternalForm());
                  }
                  return url;*/
              }
              catch(MalformedURLException e)
              {
                  System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()");
                  throw new java.lang.InternalError("assertion failed: MalformedURLException
in URLNormalizer.normalize()");
              }
              catch(IOException e)
              {
                  System.out.println("assertion failed: IOException in URLNormalizer.normalize()");
                  throw new java.lang.InternalError("assertion failed: MalformedURLException
in URLNormalizer.normalize()");
              }
  
              //return url
          }
          else
          {
              return u;
          }
      }
  
      public static void main(String[] args) throws Exception
      {
          HostManager hm = new HostManager(10);
          hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"),
hm));
          System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"),
hm));
          URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
          System.out.println("host: " + u.getHost());
          System.out.println("port: " + u.getPort());
          System.out.println(URLNormalizer.normalize(u, hm));
  
  
  
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message