cocoon-cvs mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From upayav...@apache.org
Subject cvs commit: cocoon-2.1/src/java/org/apache/cocoon/util NetUtils.java
Date Thu, 18 Sep 2003 12:11:50 GMT
upayavira    2003/09/18 05:11:50

  Modified:    src/java/org/apache/cocoon Main.java
               src/java/org/apache/cocoon/bean BeanListener.java
                        CocoonBean.java Target.java
               src/java/org/apache/cocoon/bean/helpers
                        DelayedOutputStream.java OutputStreamListener.java
               src/java/org/apache/cocoon/util NetUtils.java
  Log:
  Added removeAuthorisation() to NetUtils, which removes FTP and HTTP authentication details
from a URI. This could be extended to more protocols.
  Added page skipping reporting (link preceded with ^)
  Added facility to only crawl certain extensions (this didn't give the hoped-for speed improvement,
but I might as well leave it there, as some might want to crawl HTML but not PDFs)
  Prevented link gathering when running in link view mode
  Added code to bean to report time taken and page size
  Improved layout of console reporting of CLI (now shows links found, time taken, page size
and URI, all in nice columns)
  
  Revision  Changes    Path
  1.16      +13 -1     cocoon-2.1/src/java/org/apache/cocoon/Main.java
  
  Index: Main.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/Main.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- Main.java	17 Sep 2003 01:13:44 -0000	1.15
  +++ Main.java	18 Sep 2003 12:11:49 -0000	1.16
  @@ -162,6 +162,9 @@
       private static final String NODE_EXCLUDE = "exclude";
       private static final String ATTR_INCLUDE_EXCLUDE_PATTERN = "pattern";
       
  +    private static final String NODE_INCLUDE_LINKS = "include-links";
  +    private static final String ATTR_LINK_EXTENSION = "extension";
  +    
       private static final String NODE_URI = "uri";
       private static final String ATTR_URI_TYPE = "type";
       private static final String ATTR_URI_SOURCEPREFIX = "src-prefix";
  @@ -501,6 +504,9 @@
                           String pattern = Main.parseIncludeExcludeNode(cocoon, node, NODE_EXCLUDE);
                           cocoon.addExcludePattern(pattern);
   
  +                    } else if (nodeName.equals(NODE_INCLUDE_LINKS)) {
  +                        Main.parseIncludeLinksNode(cocoon, node);
  +
                       } else if (nodeName.equals(NODE_URI)) {
                           Main.parseURINode(cocoon, node, destDir);
   
  @@ -532,6 +538,12 @@
           NodeList nodes = node.getChildNodes();
           if (nodes.getLength()!=0) {
               throw new IllegalArgumentException("Unexpected children of <" + NODE_LOGGING
+ "> node");
  +        }
  +    }
  +
  +    private static void parseIncludeLinksNode(CocoonBean cocoon, Node node) throws IllegalArgumentException
{
  +        if (Main.hasAttribute(node, ATTR_LINK_EXTENSION)) {
  +            cocoon.addIncludeLinkExtension(Main.getAttributeValue(node, ATTR_LINK_EXTENSION));
           }
       }
   
  
  
  
  1.3       +3 -3      cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java
  
  Index: BeanListener.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- BeanListener.java	15 Sep 2003 19:18:17 -0000	1.2
  +++ BeanListener.java	18 Sep 2003 12:11:50 -0000	1.3
  @@ -71,7 +71,7 @@
                                 int linksInPage, 
                                 int newLinksinPage, 
                                 int pagesRemaining, 
  -                              int pageComplete, 
  +                              int pagesComplete, 
                                 long timeTaken);
                                 
       /**
  @@ -79,7 +79,7 @@
        * include/exclude pattern.
        * @param msg            
        */
  -    public void pageSkipped(String uri);
  +    public void pageSkipped(String uri, String message);
       
       /**
        * Report a general message about operation of the bean
  
  
  
  1.27      +68 -17    cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java
  
  Index: CocoonBean.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java,v
  retrieving revision 1.26
  retrieving revision 1.27
  diff -u -r1.26 -r1.27
  --- CocoonBean.java	15 Sep 2003 19:18:17 -0000	1.26
  +++ CocoonBean.java	18 Sep 2003 12:11:50 -0000	1.27
  @@ -100,7 +100,8 @@
       private String brokenLinkExtension = "";
       private List excludePatterns = new ArrayList();
       private List includePatterns = new ArrayList();
  -
  +    private List includeLinkExtensions = null;
  +    
       // Internal Objects
       private Map allProcessedLinks;
       private Map allTranslatedLinks;
  @@ -223,15 +224,36 @@
           includePatterns.add(preparedPattern);
       }
   
  +    public void addIncludeLinkExtension(String extension) {
  +        if (includeLinkExtensions == null) {
  +            includeLinkExtensions = new ArrayList();
  +        }
  +        includeLinkExtensions.add(extension);
  +    }
  +    
       public void addListener(BeanListener listener) {
           this.listeners.add(listener);
       }
   
  -    public void pageGenerated(String uri, int linksInPage, int pagesRemaining) {
  +    public void pageGenerated(String sourceURI, 
  +                              String destURI, 
  +                              int pageSize, 
  +                              int linksInPage, 
  +                              int newLinksInPage, 
  +                              int pagesRemaining, 
  +                              int pagesComplete, 
  +                              long timeTaken) {
           Iterator i = listeners.iterator();
           while (i.hasNext()) {
               BeanListener l = (BeanListener) i.next();
  -            l.pageGenerated(uri, "", 0, linksInPage, 0, pagesRemaining, 0, 0L);
  +            l.pageGenerated(sourceURI, 
  +                            destURI, 
  +                            pageSize, 
  +                            linksInPage, 
  +                            newLinksInPage,
  +                            pagesRemaining,
  +                            pagesComplete,
  +                            timeTaken);
           }
       }
   
  @@ -259,6 +281,14 @@
           }
       }
   
  +    public void pageSkipped(String uri, String message) {
  +        Iterator i = listeners.iterator();
  +        while (i.hasNext()) {
  +            BeanListener l = (BeanListener) i.next();
  +            l.pageSkipped(uri, message);
  +        }
  +    }
  +
       public void dispose() {
           if (this.initialized) {
               if (this.sourceResolver != null) {
  @@ -371,6 +401,10 @@
           int status = 0;
           
           int linkCount = 0;
  +        int newLinkCount = 0;
  +        int pageSize = 0;
  +        
  +        long startTimeMillis = System.currentTimeMillis();
   
           if (confirmExtension) {
               if (null == allTranslatedLinks.get(target.getSourceURI())) {
  @@ -389,9 +423,8 @@
   
           // Process links
           final HashMap translatedLinks = new HashMap();
  -        List gatheredLinks = new ArrayList();
           final List targets = new ArrayList();
  -        if (followLinks && confirmExtension) {
  +        if (followLinks && confirmExtension && isCrawlablePage(target))
{
               final Iterator i =
                   this.getLinks(target.getDeparameterizedSourceURI(), target.getParameters()).iterator();
   
  @@ -400,13 +433,12 @@
                   Target linkTarget = target.getDerivedTarget(linkURI);
   
                   if (linkTarget == null) {
  -                    System.out.println("Skipping "+ linkURI);
  -                    //@TODO@ Log/report skipped link
  +                    pageSkipped(linkURI, "link does not share same root as parent");
                       continue;
                   }
   
                   if (!isIncluded(linkTarget.getSourceURI())) {
  -                    //@TODO@ Log/report skipped link
  +                    pageSkipped(linkTarget.getSourceURI(), "matched include/exclude rules");
                       continue;
                   }
   
  @@ -436,6 +468,13 @@
               // Process URI
               DelayedOutputStream output = new DelayedOutputStream();
               try {
  +                List gatheredLinks;
  +                if (!confirmExtension && followLinks && isCrawlablePage(target))
{
  +                    gatheredLinks = new ArrayList();
  +                } else {
  +                    gatheredLinks = null;
  +                }
  +        
                   status =
                       getPage(
                           target.getDeparameterizedSourceURI(),
  @@ -450,7 +489,7 @@
                           "Resource not found: " + status);
                   }
   
  -                if (followLinks && !confirmExtension) {
  +                if (gatheredLinks != null) {
                       for (Iterator it = gatheredLinks.iterator();it.hasNext();) {
                           String linkURI = (String) it.next();
                           Target linkTarget = target.getDerivedTarget(linkURI);
  @@ -470,7 +509,6 @@
                       linkCount = gatheredLinks.size();
                   }
   
  -                pageGenerated(target.getSourceURI(), linkCount, 0); // @todo@ get the number
of pages remaining here
               } catch (ProcessingException pe) {
                   output.close();
                   output = null;
  @@ -482,11 +520,21 @@
   
                       ModifiableSource source = getSource(target);
                       try {
  +                        pageSize = output.size();
                           OutputStream stream = source.getOutputStream();
   
                           output.setFileOutputStream(stream);
                           output.flush();
                           output.close();
  +                        pageGenerated(target.getSourceURI(), 
  +                                      target.getAuthlessDestURI(), 
  +                                      pageSize,
  +                                      linkCount,
  +                                      newLinkCount,
  +                                      0, //pagesRemaining,  @TODO@ Implement this
  +                                      0, //pagesComplete,   @TODO@ Implement this
  +                                      System.currentTimeMillis()- startTimeMillis);
  +
                       } catch (IOException ioex) {
                           log.warn(ioex.toString());
                       } finally {
  @@ -499,11 +547,6 @@
               this.sendBrokenLinkWarning(target.getSourceURI(), "URI not found");
           }
   
  -/*  Commenting out timestamp - will reimplement properly using the BeanListener interface
  -        double d = (System.currentTimeMillis()- startTimeMillis);
  -        String time = " [" + (d/1000) + " seconds]";
  -        System.out.println("        "+ time);
  -*/
           return targets;
       }
   
  @@ -520,7 +563,7 @@
               //String brokenFile = NetUtils.decodePath(destinationURI);
               
               if (brokenLinkExtension != null) {
  -                target.setExtension(brokenLinkExtension);
  +                target.setExtraExtension(brokenLinkExtension);
               }
               SimpleNotifyingBean n = new SimpleNotifyingBean(this);
               n.setType("resource-not-found");
  @@ -596,5 +639,13 @@
               }
           }
           return included;
  +    }
  +    private boolean isCrawlablePage(Target target) {
  +        if (includeLinkExtensions == null) {
  +            return true;
  +        } else {
  +            String extension = target.getExtension();
  +            return includeLinkExtensions.contains(target.getExtension());
  +        }
       }
   }
  
  
  
  1.6       +17 -2     cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java
  
  Index: Target.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Target.java	15 Sep 2003 19:18:18 -0000	1.5
  +++ Target.java	18 Sep 2003 12:11:50 -0000	1.6
  @@ -186,7 +186,7 @@
        * stand out, within the file structure of the generated site, by, for
        * example, adding '.error' to the end of the filename.
        */
  -    public void setExtension(String extension) {
  +    public void setExtraExtension(String extension) {
           this.extension = extension;
           this.finalDestinationURI = null;
       }
  @@ -230,6 +230,13 @@
           return NetUtils.getPath(this.getSourceURI());
       }
   
  +    /**
  +     * Gets the file extension for the source URI
  +     */
  +    public String getExtension() {
  +        return NetUtils.getExtension(this.getSourceURI());
  +    }
  +    
       /** 
        * Gets the parent URI (the URI of the page that contained
        * a link to this URI). null is returned if this page was
  @@ -325,6 +332,14 @@
           return NetUtils.relativize(path, actualSourceURI);
       }
   
  +    /**
  +     * 
  +     * @return
  +     */
  +    public String getAuthlessDestURI() throws ProcessingException {
  +        return NetUtils.removeAuthorisation(this.getDestinationURI());
  +    }
  +    
       /**
        * Gets the original URI used to create this Target.
        * This URI is completely unprocessed.
  
  
  
  1.4       +10 -1     cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java
  
  Index: DelayedOutputStream.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- DelayedOutputStream.java	10 Jun 2003 11:17:25 -0000	1.3
  +++ DelayedOutputStream.java	18 Sep 2003 12:11:50 -0000	1.4
  @@ -250,4 +250,13 @@
               throw new IOException("No outputstream available!");
           }
       }
  +    /**
  +     * Gets the size of the content of the current output stream
  +     */
  +    public int size() {
  +        if (baos != null) {
  +            return baos.size();
  +        }
  +        return 0;
  +    }
   }
  
  
  
  1.2       +40 -10    cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java
  
  Index: OutputStreamListener.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- OutputStreamListener.java	15 Sep 2003 19:18:18 -0000	1.1
  +++ OutputStreamListener.java	18 Sep 2003 12:11:50 -0000	1.2
  @@ -93,15 +93,28 @@
                                 String destinationURI, 
                                 int pageSize,
                                 int linksInPage, 
  -                              int newLinksinPage, 
  +                              int newLinksInPage, 
                                 int pagesRemaining, 
  -                              int pageComplete, 
  +                              int pagesComplete, 
                                 long timeTaken) {
  +        double time = (((double)timeTaken)/1000);
  +        
  +        String size;
  +        if (pageSize < 1024) {
  +            size = pageSize + "b";
  +        } else {
  +            size = ((float)((int)(pageSize/102.4)))/10 + "Kb";
  +        }
  +        
           if (linksInPage == -1) {
               this.print("* " + sourceURI);
           } else {
  -            this.print("* ["+linksInPage + "] "+sourceURI);
  -        }
  +            this.print(pad(13, "* ["+linksInPage + "/" + newLinksInPage + "] ") +
  +                       pad(7,time + "s ") +
  +                       pad(8, size) +
  +                       sourceURI);
  +        }     
  +           
       }
       public void messageGenerated(String msg) {
           this.print(msg);
  @@ -112,7 +125,7 @@
       }
   
       public void brokenLinkFound(String uri, String parentURI, String message, Throwable
t) {
  -        this.print("X [0] "+uri+"\tBROKEN: "+message);
  +        this.print(pad(28,"X [0] ")+uri+"\tBROKEN: "+message);
           brokenLinks.add(uri + "\t" + message);
           
   //            StringWriter sw = new StringWriter();
  @@ -121,8 +134,8 @@
   
       }
   
  -    public void pageSkipped(String uri) {
  -        // @TODO@ Do something
  +    public void pageSkipped(String uri, String message) {
  +        this.print("^ "+uri);
       }
       
       public void complete() {
  @@ -130,8 +143,7 @@
   
           long duration = System.currentTimeMillis() - startTimeMillis;
           this.print("Total time: " + (duration / 60000) + " minutes " + (duration % 60000)/1000
+ " seconds");
  -        writer.flush();
  -        writer.close();
  +        this.close();
       }
   
       public boolean isSuccessful() {
  @@ -184,7 +196,25 @@
           }
       }
   
  +    private String pad(int chars, String str) {
  +        int len = str.length();
  +        if (len < chars) {
  +            StringBuffer sb = new StringBuffer(chars > len ? chars+1 : len+1);
  +            sb.append(str);
  +            for (int i=len; i<chars; i++) {
  +                sb.append(" ");
  +            }
  +            return sb.toString();
  +        }
  +        return str;
  +    }
  +    
       private void print(String message) {
           writer.println(message);
  +        writer.flush();
  +    }
  +    
  +    private void close() {
  +        writer.close();
       }
   }
  
  
  
  1.4       +10 -1     cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java
  
  Index: NetUtils.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- NetUtils.java	27 May 2003 08:46:58 -0000	1.3
  +++ NetUtils.java	18 Sep 2003 12:11:50 -0000	1.4
  @@ -447,4 +447,13 @@
           return pars;
       }
   
  +    /**
  +     * Remove any authorisation details from a URI
  +     */
  +    public static String removeAuthorisation(String uri) {
  +        if (uri.indexOf("@")!=-1 && (uri.startsWith("ftp://") || uri.startsWith("http://")))
{
  +            return uri.substring(0, uri.indexOf(":")+2)+uri.substring(uri.indexOf("@")+1);
  +        } 
  +        return uri;
  +    }
   }
  
  
  

Mime
View raw message