commons-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bur...@apache.org
Subject cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate ResourceExpander.java DiscoveryLocator.java FeedLocator.java
Date Thu, 15 Apr 2004 04:57:18 GMT
burton      2004/04/14 21:57:18

  Modified:    feedparser/src/java/org/apache/commons/feedparser/locate
                        DiscoveryLocator.java FeedLocator.java
  Added:       feedparser TODO_DATES
               feedparser/src/java/org/apache/commons/feedparser/locate
                        ResourceExpander.java
  Log:
  ability to expand URLs
  
  Revision  Changes    Path
  1.1                  jakarta-commons-sandbox/feedparser/TODO_DATES
  
  Index: TODO_DATES
  ===================================================================
  
  **** RSS 2.0  ****
      
  http://asg.web.cmu.edu/rfc/rfc822.html#sec-5
  
  5.1 SYNTAX
  
       date-time   =  [ day "," ] date time        ; dd mm yy
                                                   ;  hh:mm:ss zzz
  
       day         =  "Mon"  / "Tue" /  "Wed"  / "Thu"
                   /  "Fri"  / "Sat" /  "Sun"
  
       date        =  1*2DIGIT month 2DIGIT        ; day month year
                                                   ;  e.g. 20 Jun 82
  
       month       =  "Jan"  /  "Feb" /  "Mar"  /  "Apr"
                   /  "May"  /  "Jun" /  "Jul"  /  "Aug"
                   /  "Sep"  /  "Oct" /  "Nov"  /  "Dec"
  
       time        =  hour zone                    ; ANSI and Military
  
       hour        =  2DIGIT ":" 2DIGIT [":" 2DIGIT]
                                                   ; 00:00:00 - 23:59:59
  
       zone        =  "UT"  / "GMT"                ; Universal Time
                                                   ; North American : UT
                   /  "EST" / "EDT"                ;  Eastern:  - 5/ - 4
                   /  "CST" / "CDT"                ;  Central:  - 6/ - 5
                   /  "MST" / "MDT"                ;  Mountain: - 7/ - 6
                   /  "PST" / "PDT"                ;  Pacific:  - 8/ - 7
                   /  1ALPHA                       ; Military: Z = UT;
                                                   ;  A:-1; (J not used)
                                                   ;  M:-12; N:+1; Y:+12
                   / ( ("+" / "-") 4DIGIT )        ; Local differential
                                                   ;  hours+min. (HHMM)
  
  5.2 SEMANTICS
  
      If included, day-of-week must be the day implied by the date specification.
  
      Time zone may be indicated in several ways. "UT" is Universal Time
      (formerly called "Greenwich Mean Time"); "GMT" is permitted as a reference
      to Universal Time. The military standard uses a single character for each
      zone. "Z" is Universal Time. "A" indicates one hour earlier, and "M"
      indicates 12 hours earlier; "N" is one hour later, and "Y" is 12 hours
      later. The letter "J" is not used. The other remaining two forms are taken
      from ANSI standard X3.51-1975. One allows explicit indication of the amount
      of offset from UT; the other uses common 3-character strings for indicating
      time zones in North America.
  
  
  
  1.4       +32 -28    jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java
  
  Index: DiscoveryLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- DiscoveryLocator.java	15 Apr 2004 00:58:44 -0000	1.3
  +++ DiscoveryLocator.java	15 Apr 2004 04:57:17 -0000	1.4
  @@ -18,6 +18,7 @@
   
   import java.io.*;
   import java.util.*;
  +import java.util.regex.*;
   
   //FIXME: do NOT use apache regex as it has major problems.
   import org.apache.regexp.*;
  @@ -28,6 +29,24 @@
    */
   public class DiscoveryLocator {
   
  +    public static final String ATOM_MEDIA_TYPE = "application/atom+xml";
  +    public static final String RSS_MEDIA_TYPE  = "application/rss+xml";
  +    public static final String XML_MEDIA_TYPE  = "text/xml";
  +    
  +    //NOTE: this will break if the attributes aren't in the right order.
  +    static Pattern pattern =
  +        Pattern.compile( "<link[^>]+type=[\"']([^\"']+)[\"'][^>]+href=[\"']([^\"']+)"
);
  +
  +    static HashSet mediatypes = new HashSet();
  +
  +    static {
  +
  +        mediatypes.add( ATOM_MEDIA_TYPE );
  +        mediatypes.add( RSS_MEDIA_TYPE );
  +        mediatypes.add( XML_MEDIA_TYPE );
  +        
  +    }
  +
       /**
        * 
        *
  @@ -38,41 +57,26 @@
   
           //this mechanism is easier but it isn't efficient.  I should just parse
           //elements forward until I discover </head>.  Also note that this isn't
  -        //doing all feed URLs just the first ones it finds.
  +        //doing all feed URLs just the first ones it finds.  
   
  -        doDiscovery( "application/atom\\+xml", content, list );
  -        doDiscovery( "application/rss\\+xml", content, list );
  -        doDiscovery( "text/xml", content, list );
  +        Matcher m = pattern.matcher( content );
   
  -        return list;
  -        
  -    }
  -
  -    private static void doDiscovery( String type, String content, List list ) {
  +        while( m.find() ) {
   
  -        String resource = getLinkType( type, content );
  +            String type=m.group( 1 );
   
  -        if ( resource != null )
  -            list.add( new FeedReference( resource, type ) );
  -
  -    }
  +            if ( mediatypes.contains( type )  ) {
   
  -    /**
  -     * Get the first link with the given type.
  -     *
  -     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
  -     */
  -    private static String getLinkType( String type, String content ) {
  -
  -        RE regexp = new RE( "<link[^>]+type=[\"']" + type + "[\"'][^>]+href=[\"']([^\"']+)"
);
  -
  -        //var regexp = new RE( "<link[^>]+type=\"application/rss\\+xml" );
  +                //FIXME: expand the href
  +                String href = m.group( 2 );
  +                list.add( new FeedReference( href, type ) );
  +                return list;
  +            }
  +            
  +        }
           
  -        if ( regexp.match( content ) )
  -            return regexp.getParen( 1 );
  +        return list;
           
  -        return null;
  -
       }
   
   }
  
  
  
  1.4       +1 -0      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
  
  Index: FeedLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FeedLocator.java	15 Apr 2004 00:58:44 -0000	1.3
  +++ FeedLocator.java	15 Apr 2004 04:57:18 -0000	1.4
  @@ -72,6 +72,7 @@
       public static void main( String[] args ) throws Exception {
   
           String resource = "http://diveintomark.org";
  +        //String resource = "http://peerfear.org";
   
           List l = locate( resource );
   
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java
  
  Index: ResourceExpander.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.locate;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  
  /**
   * Expands links found inside a document into absolute URLs, using the URL of
   * the document itself (the "resource") as the reference point.
   *
   * Only http: and file: resources are understood.  Links that carry any other
   * scheme (mailto:, javascript:, ...) are never rewritten.
   *
   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
   * @version $Id: ResourceExpander.java,v 1.1 2004/04/15 04:57:18 burton Exp $
   */
  public class ResourceExpander {

      /**
       * Expand a link relative to the current resource.  This takes care of
       * links such as
       *
       * /foo.html -> http://site.com/foo.html
       *
       * foo.html -> http://site.com/base/foo.html
       *
       * ../foo.html -> http://site.com/foo.html
       *
       * #anchor -> the resource URL followed by #anchor
       *
       * Links should *always* be expanded before they are used.
       *
       * This is because if we use the URL http://site.com/base then we don't know
       * if it's a directory or a file.  http://site.com/base/ would be a directory.
       * 
       * Note that all resource URLs are expected to have correct trailing
       * slashes.  If the URL does not end with / then it is a file URL and not a
       * directory.
       *
       * A link that already carries a scheme is returned untouched: http: and
       * file: links are already absolute and anything else (mailto:,
       * javascript:, ...) cannot be expanded.  Parent references ("..") that
       * would climb above the root of the site are ignored instead of producing
       * a mangled URL.
       *
       * @param resource the absolute URL of the document the link was found in
       * @param link the link to expand; may be null
       * @return the expanded link, or null when link is null
       * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
       */
      public static String expand( String resource, String link ) throws Exception {

          if ( link == null )
              return null;

          //schemes we do not understand are handed back as they came in
          if ( isInvalidScheme( link ) )
              return link;

          //nothing to do if ALREADY absolute (http: and file: links)
          if ( isExpanded( link ) || hasScheme( link ) )
              return link;

          //    From: http://www.w3.org/Addressing/rfc1808.txt
          //        
          //    If the parse string begins with a double-slash "//", then the
          //    substring of characters after the double-slash and up to, but not
          //    including, the next slash "/" character is the network
          //    location/login (<net_loc>) of the URL.
          //
          //NOTE: the scheme is always assumed to be http here.
          
          if ( link.startsWith( "//" ) )
              return "http:" + link;

          //absolute path on the same site
          if ( link.startsWith( "/" ) )
              return getSite( resource ) + link;

          //fragment inside the same document
          if ( link.startsWith( "#" ) )
              return resource + link;

          //everything else is relative to the directory of the resource
          String base = getBase( resource );

          //never climb higher than the site itself
          int floor = getSite( resource ).length();

          //only ".." and "../..." are parent references; a name such as
          //"..foo" is an ordinary relative path.
          while ( link.equals( ".." ) || link.startsWith( "../" ) ) {

              //get rid of the first previous dir in the link
              int begin = 2;
              if ( link.length() > 2 )
                  begin = 3;

              link = link.substring( begin );

              //get rid of the last directory in the base, unless we are
              //already at the root of the site.
              int slash = base.lastIndexOf( '/' );

              if ( slash >= floor )
                  base = base.substring( 0, slash );

          }

          return base + "/" + link;
          
      }

      /**
       * Return true if the given link is ALREADY expanded, that is, if it is an
       * absolute http:// URL.
       *
       * @param resource the link to test
       * @author <a href="mailto:burton@peerfear.org">Kevin Burton</a>
       */
      public static boolean isExpanded( String resource ) {
          return resource.startsWith( "http://" );
      }
      
      /**
       * Return true if this link uses a scheme we do not handle and therefore
       * should NOT be expanded (javascript, mailto, etc).  Links without any
       * scheme (plain relative links) and http: / file: links are valid.
       *
       * @param resource the link to test
       * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
       */
      public static boolean isInvalidScheme( String resource ) {

          //a relative link has no scheme at all, so it can not be invalid
          if ( hasScheme( resource ) == false )
              return false;

          //only file: and http: are supported (scheme names ignore case)

          if ( startsWithIgnoreCase( resource, "http:" ) )
              return false;

          if ( startsWithIgnoreCase( resource, "file:" ) )
              return false;

          return true;
          
      }

      /**
       * Get the site for this resource.  For example:
       *
       * http://www.foo.com/directory/index.html
       *
       * we will return
       *
       * http://www.foo.com
       *
       * for file: URLs we return file://
       *
       * @param resource an absolute URL
       * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
       */
      public static String getSite( String resource ) {

          if ( resource.startsWith( "file:" ) ) {
              return "file://";
          } 

          //start at 8 so that the slashes of both http:// and https:// are
          //skipped; the first slash found after that ends the host name.
          int end = resource.indexOf( "/", 8 );

          if ( end == -1 ) {

              end = resource.length();

          } 

          return resource.substring( 0, end );

      }

      /**
       * Get the base of this URL.  For example if we are given:
       *
       * http://www.foo.com/directory/index.html
       *
       * we will return
       *
       * http://www.foo.com/directory
       *
       * The result never ends with a slash.  A URL without any path, such as
       * http://www.cnn.com, is returned unchanged.
       *
       * @param resource an absolute URL
       * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
       */
      public static String getBase( String resource ) {

          //a slash only separates path segments when it comes after the site
          int begin = getSite( resource ).length();
          
          int end = resource.lastIndexOf( "/" );
          
          if ( end == -1 || end < begin ) {
              
              //probably a URL like http://www.cnn.com
              
              end = resource.length();
              
          } 

          return resource.substring( 0, end );
          
      } 

      /**
       * Return true if the link begins with a URL scheme, that is a letter
       * followed by letters, digits, "+", "-" or "." and then a colon.
       */
      private static boolean hasScheme( String link ) {

          if ( link.length() == 0 || isLetter( link.charAt( 0 ) ) == false )
              return false;

          for ( int i = 1; i < link.length(); ++i ) {

              char c = link.charAt( i );

              if ( c == ':' )
                  return true;

              boolean schemeChar = isLetter( c )
                  || ( c >= '0' && c <= '9' )
                  || c == '+' || c == '-' || c == '.';

              //anything else ( /, ?, #, ... ) means we are inside a path
              if ( schemeChar == false )
                  return false;

          }

          return false;

      }

      /**
       * Return true for the ASCII letters a-z and A-Z.
       */
      private static boolean isLetter( char c ) {
          return ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );
      }

      /**
       * Case insensitive version of String.startsWith.
       */
      private static boolean startsWithIgnoreCase( String value, String prefix ) {
          return value.regionMatches( true, 0, prefix, 0, prefix.length() );
      }

      /**
       * Prints the expansion of a handful of sample links.
       */
      public static void main( String[] args ) throws Exception {

          System.out.println( expand( "http://peerfear.org/foo/bar/", "../../blog" ) );

          System.out.println( expand( "http://peerfear.org/foo/bar/", "../../index.html" ) );

          System.out.println( expand( "http://peerfear.org/blog/", ".." ) );

          System.out.println( expand( "http://peerfear.org", "/blog" ) );
          System.out.println( expand( "http://peerfear.org", "http://peerfear.org" ) );

          System.out.println( expand( "http://peerfear.org", "blog" ) );
          System.out.println( expand( "http://peerfear.org/blog", "foo/bar" ) );

          System.out.println( expand( "file://projects/newsmonster/", "blog" ) );

          System.out.println( expand( "file:/projects/ksa/src/java/ksa/test/TestFeedTask_WithRelativePath.rss"
                                        , "/blog" ) );        
      }

  }
  
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


Mime
View raw message