lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cmarsch...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net URLNormalizer.java
Date Tue, 22 Oct 2002 15:24:26 GMT
cmarschner    2002/10/22 08:24:26

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/net
                        URLNormalizer.java
  Log:
  changed hostManager stuff to hostResolver
  
  Revision  Changes    Path
  1.2       +26 -32    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java
  
  Index: URLNormalizer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- URLNormalizer.java	17 Jun 2002 14:00:13 -0000	1.1
  +++ URLNormalizer.java	22 Oct 2002 15:24:26 -0000	1.2
  @@ -55,6 +55,7 @@
    */
   import java.io.*;
   import java.net.*;
  +import org.apache.oro.text.perl.*;
   
   
   /**
  @@ -75,7 +76,7 @@
        * contains hex codes for characters in lowercase uses char arrays instead
        * of strings for faster processing
        */
  -    protected static char[][] charMap = {
  +    protected final static char[][] charMap = {
               {'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'},
               {'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'},
               {'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
  @@ -337,25 +338,31 @@
        * @param host  Description of the Parameter
        * @return      Description of the Return Value
        */
  -    protected static String normalizeHost(HostManager hostManager, String host)
  +    protected static String normalizeHost(HostResolver hostResolver, String host)
       {
  -        return hostManager.getHostInfo(host.toLowerCase()).getHostName();
  +        return hostResolver.resolveHost(host.toLowerCase());
       }
   
  -/*
  -    HostManager hostManager;
  -*/
  +
  +
  +
  +    HostResolver hostResolver;
  +
   
       /**
        * Constructor for the URLNormalizer object
        *
        * @param hostManager  Description of the Parameter
        */
  -   /* public URLNormalizer(HostManager hostManager)
  +    public URLNormalizer(HostResolver hostResolver)
       {
  -        this.hostManager = hostManager;
  -    }*/
  +        this.hostResolver = hostResolver;
  +    }
   
  +    public void setHostResolver(HostResolver hostResolver)
  +    {
  +        this.hostResolver = hostResolver;
  +    }
   
       /**
        * Description of the Method
  @@ -365,15 +372,19 @@
        * @exception IOException            Description of the Exception
        * @exception MalformedURLException  Description of the Exception
        */
  -    public static URL normalize(URL u, HostManager hostManager)
  +    public static URL normalize(URL u, HostResolver hostResolver)
       {
  +        if(u == null)
  +        {
  +            return null;
  +        }
           if (u.getProtocol().equals("http"))
           {
               try
               {
                   int port = u.getPort();
                   /*URL url =*/
  -                return  new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
  +                return  new URL(u.getProtocol(), normalizeHost(hostResolver, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
                   /*if(!u.equals(url))
                   {
                       System.out.println(u.toExternalForm() + " -> " + url.toExternalForm());
  @@ -399,27 +410,10 @@
           }
       }
   
  -    public static void main(String[] args) throws Exception
  -    {
  -        HostManager hm = new HostManager(10);
  -        hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm));
  -        System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm));
  -        URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
  -        System.out.println("host: " + u.getHost());
  -        System.out.println("port: " + u.getPort());
  -        System.out.println(URLNormalizer.normalize(u, hm));
  -
  -
   
  +    public URL normalize(URL u)
  +    {
  +        return this.normalize(u, hostResolver);
       }
  +
   }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message