commons-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bur...@apache.org
Subject cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate AnchorParser.java AnchorParserListener.java ProbeLocator.java
Date Thu, 20 Jan 2005 09:00:37 GMT
burton      2005/01/20 01:00:37

  Modified:    feedparser/src/java/org/apache/commons/feedparser
                        DefaultFeedDirectoryParserListener.java
                        FeedDirectoryParserListener.java FeedFilter.java
                        FeedParser.java
               feedparser/src/java/org/apache/commons/feedparser/impl
                        DebugFeedParserListener.java
               feedparser/src/java/org/apache/commons/feedparser/locate
                        AnchorParser.java AnchorParserListener.java
                        ProbeLocator.java
  Added:       feedparser/src/java/org/apache/commons/feedparser
                        HTMLFeedParser.java
  Log:
  Experimental support for XFN to verify that the format can work within the FeedParser infra
  
  Revision  Changes    Path
  1.3       +6 -1      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/DefaultFeedDirectoryParserListener.java
  
  Index: DefaultFeedDirectoryParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/DefaultFeedDirectoryParserListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- DefaultFeedDirectoryParserListener.java	28 Feb 2004 03:35:21 -0000	1.2
  +++ DefaultFeedDirectoryParserListener.java	20 Jan 2005 09:00:37 -0000	1.3
  @@ -39,6 +39,11 @@
   
       public void onItemEnd() throws FeedParserException {}
   
  +    public void onRelation( FeedParserState state,
  +                            String value ) {}
  +
  +    public void onRelationEnd() {}
  +
       public void onFolder( FeedParserState state,
                             String name ) throws FeedParserException {}
   
  
  
  
  1.3       +37 -8     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedDirectoryParserListener.java
  
  Index: FeedDirectoryParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedDirectoryParserListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FeedDirectoryParserListener.java	28 Feb 2004 03:35:21 -0000	1.2
  +++ FeedDirectoryParserListener.java	20 Jan 2005 09:00:37 -0000	1.3
  @@ -32,8 +32,12 @@
    * <dl>
    *     <dt>FDML</dt>
    *     <dd>http://www.intertwingly.net/wiki/fdml/</dd>
  - *     <dt>OPML</dt>
  - *     <dt>OCS</dt>
  + * 
  + *     <dt>OPML (Outline Processor Markup Language)</dt>
  + *     <dt>OCS (Open Content Syndication)</dt>
  + * 
  + *     <dt>XFN (XHTML Friends Network)</dt>
  + * 
    * </dl>
    * 
    * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
  @@ -42,12 +46,19 @@
   public interface FeedDirectoryParserListener extends FeedParserListener {
   
       /**
  -     * Called when an directory item is found.
  +     * Called when a directory item is found.  This is compatible with the
  +     * FeedParserListener so that existing implementations work.  This provides
  +     * a mechanism to index FDML, OPML, OCS, etc with existing feed parsers.
        *
  -     * @param weblog The HTML URL to the root of the weblog.  
  -     *               Example.  http://www.peerfear.org
  +     * @param weblog The HTML URL to the root of the weblog.  Example:
  +     * http://www.peerfear.org
        *
  -     * @param feed The XML URL to the RSS/Atom feed for this weblog.
  +     * @param title The title of the feed or weblog.  May be null when not
  +     * specified.
  +     * 
  +     * @param feed The XML URL to the RSS/Atom feed for this weblog.  This may
  +     * be null in some situations when we don't have a feed URL
  +     * 
        * @see FeedParserListener#onItem
        * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
        */
  @@ -60,8 +71,26 @@
       public void onItemEnd() throws FeedParserException;
   
       /**
  +     * Called when we've found a relation for a given item.  This way you can
  +     * specify the relationship you have with a given entry in your directory.
  +     * This is mostly for compatibility purposes with XFN so that the values can
  +     * be 'met', 'date', 'sweetheart', 'friend'.
  +     *
  +     * For XFN we would call onItem() methods and then onRelation() methods with
  +     * each of the relations passed.
  +     * 
  +     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
  +     */
  +    public void onRelation( FeedParserState state,
  +                            String value );
  +
  +    public void onRelationEnd();
  +
  +    /**
        * Called when a new Folder is found.  If feeds are in the default root
  -     * folder this method is not called.
  +     * folder this method is not called.  This is mostly for OPML support but
  +     * could be used within other feed formats.  When this method isn't called
  +     * one could assume that items are in the 'root' folder or no folder.
        *
        * @author <a href="mailto:burton@peerfear.org">Kevin Burton</a>
        */
  
  
  
  1.6       +10 -4     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java
  
  Index: FeedFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- FeedFilter.java	14 Sep 2004 01:32:04 -0000	1.5
  +++ FeedFilter.java	20 Jan 2005 09:00:37 -0000	1.6
  @@ -28,10 +28,13 @@
    */
   public class FeedFilter {
   
  +    public static boolean DO_REMOVE_LEADING_PROLOG = true;
  +    public static boolean DO_DECODE_ENTITIES = true;
  +
       public static HashMap LATIN1_ENTITIES = new HashMap();
   
       private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
  -    
  +
       public static byte[] parse( byte[] bytes )
           throws Exception {
   
  @@ -52,9 +55,12 @@
           throws Exception {
   
           //remove leading prolog...
  +        if ( DO_REMOVE_LEADING_PROLOG )
  +            content = doRemoveLeadingProlog( content, encoding );
   
  -        content = doRemoveLeadingProlog( content, encoding );
  -        content = doDecodeEntities( content );
  +        //decode HTML entities that are referenced.
  +        if ( DO_DECODE_ENTITIES )
  +            content = doDecodeEntities( content );
           
           return content.getBytes( encoding );
   
  
  
  
  1.11      +28 -14    jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
  
  Index: FeedParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- FeedParser.java	3 Sep 2004 19:46:47 -0000	1.10
  +++ FeedParser.java	20 Jan 2005 09:00:37 -0000	1.11
  @@ -60,15 +60,15 @@
   
           try { 
   
  -            is = getCorrectInputStream( is );
  +            // Need to massage our XML support for UTF-8 to prevent the dreaded
  +            // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
  +            // default feeds.  This was tested a great deal under NewsMonster
  +            // and I'm happy with the results.  Within FeedParser 2.0 we will be
  +            // using SAX2 so this won't be as big of a problem.  In FeedParser
  +            // 2.0 (or as soon as we use SAX) this code should be totally
  +            // removed to use the original stream.
   
  -            // Need to massage our XML support forfor UTF-8 to prevent the
  -            // dreaded "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in
  -            // some default feeds.  This was tested a great deal under
  -            // NewsMonster and I'm happy with the results.  Within FeedParser
  -            // 2.0 we will be using SAX2 so this won't be as big of a problem.
  -            // In FeedParser 2.0 (or as soon as we use SAX) this code should be
  -            // totally removed to use the original stream.
  +            is = getCorrectInputStream( is );
   
               //OK.  Now we have the right InputStream so we should build our DOM
               //and exec.
  @@ -81,7 +81,14 @@
           } catch ( FeedParserException fpe ) {
               //if an explicit FeedParserException is thrown just rethrow it..
               throw fpe;
  -        } catch ( Throwable t ) { throw new FeedParserException( t ); }
  +        } catch ( Throwable t ) {
  +
  +            //FIXME: when this is a JDOM or XML parser Exception we should
  +            //detect when we're working with an XHTML or HTML file and then
  +            //parse it with an XFN/XOXO event listener.
  +            
  +            throw new FeedParserException( t );
  +        }
   
       }
   
  @@ -95,8 +102,11 @@
   
           byte[] bytes = toByteArray( is );
   
  -        //FIXME: if we return the WRONG content type here we will royally fuck
  -        //up getByets... UTF-16 and UTF-32 especially
  +        //FIXME: if we return the WRONG content type here we will break.
  +        //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
  +        //HTTP Content-Type parsing here to preserve the content type.  This can
  +        //be fixed by integrating our networking API from NewsMonster.
  +        
           String encoding = XMLEncodingParser.parse( bytes );
   
           if ( encoding == null )
  @@ -150,6 +160,7 @@
                   return;
               }
   
  +            //Handle changes.xml
               if ( "weblogUpdates".equals( root ) ) {
                   ChangesFeedParser.parse( listener, doc );
                   return;
  @@ -163,12 +174,15 @@
   
               //Handle FOAF
               if ( doc.getRootElement().getChildren( "Person", NS.FOAF ).size() > 0 )
{
  -
                   FOAFFeedParser.parse( listener, doc );
  -                
                   return;
               }
   
  +            //FIXME: if this is XHTML we need to handle this with either an XFN
  +            //or an XOXO directory parser.  There might be more metadata we need
  +            //to parse here.  (also I wonder if this could be a chance to do
  +            //autodiscovery).
  +            
               //fall back on RDF and RSS
   
               RSSFeedParser.parse( listener, doc );
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/HTMLFeedParser.java
  
  Index: HTMLFeedParser.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser;
  
  import org.apache.commons.feedparser.impl.*;
  import org.apache.commons.feedparser.locate.*;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  
  /**
   *
   * Experimental class to play with supporting XFN.  HTML parsing in general is
   * interesting because I could start with the AnchorParser and move to an HTML
   * parser but that might be too generic.
   * 
   * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
   * @version $Id: HTMLFeedParser.java,v 1.1 2005/01/20 09:00:37 burton Exp $
   */
  public class HTMLFeedParser extends BaseParser {
  
      public static final HashSet XFN_RELATIONS = new HashSet();
      
      public static void parse( String content, final FeedParserListener listener ) throws
Exception {
  
          if ( listener instanceof FeedDirectoryParserListener == false )
              return;
          
          final FeedDirectoryParserListener directoryParserLisener =
              (FeedDirectoryParserListener)listener;
  
          directoryParserLisener.init();
          
          final FeedParserState state = new FeedParserState();
          
          AnchorParserListener alistener = new AnchorParserListener() {
  
                  public void setContext( Object context ) {}
  
                  public Object getResult() { return null; }
  
                  public boolean onAnchor( String href, String rel, String title )
                      throws AnchorParserException {
  
                      try {
                          
                          if ( rel == null || "".equals( rel ) )
                              return true;
  
                          //right now these aren't valid here
                          String description = null;
                          String feed = null;
                      
                          //FIXME: only include onItem when we have at least ONE XFN
                          //relations that valid.
  
                          directoryParserLisener.onItem( state, title, href, description,
feed );
                      
                          String[] rels = rel.split( " " );
  
                          for ( int i = 0; i < rels.length; ++i ) {
  
                              String current = rels[i];
  
                              //FIXME: when this current rel is NOT part of any XFN
                              //spec we should not be using the feed parser listener
                              //because it might just be a nofollow link or such.
  
                              boolean isXFriendRel = XFN_RELATIONS.contains( current );
  
                              if ( isXFriendRel ) {
                          
                                  directoryParserLisener.onRelation( state,
                                                                     current );
                              
                                  directoryParserLisener.onRelationEnd();
  
                              }
                          
                          }
  
                          directoryParserLisener.onItemEnd();
                      
                          //split this into individual rels... then call them.
                      
                          return true;
  
                      } catch ( Exception e ) {
                          throw new AnchorParserException( e );
                      }
  
                  }
  
              };
  
          AnchorParser.parse( content, alistener );
  
          directoryParserLisener.finished();
  
      }
  
      public static void main( String[] args ) throws Exception {
  
          FeedParserListener listener = new DebugFeedParserListener();
          
          parse( "<a href='http://jane-blog.example.org/' rel='sweetheart date met'>Jane</a>
",
                 listener );
          
      }
  
      static {
  
          XFN_RELATIONS.add( "contact" );
          XFN_RELATIONS.add( "acquaintance" );
          XFN_RELATIONS.add( "friend" );
          XFN_RELATIONS.add( "met" );
          XFN_RELATIONS.add( "co-worker" );
          XFN_RELATIONS.add( "colleague" );
          XFN_RELATIONS.add( "co-resident" );
          XFN_RELATIONS.add( "neighbor" );
          XFN_RELATIONS.add( "child" );
          XFN_RELATIONS.add( "parent" );
          XFN_RELATIONS.add( "sibling" );
          XFN_RELATIONS.add( "spouse" );
          XFN_RELATIONS.add( "kin" );
          XFN_RELATIONS.add( "muse" );
          XFN_RELATIONS.add( "crush" );
          XFN_RELATIONS.add( "date" );
          XFN_RELATIONS.add( "sweetheart" );
          XFN_RELATIONS.add( "me" );
          
      }
      
  }
  
  
  
  
  1.11      +12 -4     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/impl/DebugFeedParserListener.java
  
  Index: DebugFeedParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/impl/DebugFeedParserListener.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- DebugFeedParserListener.java	17 Oct 2004 23:43:24 -0000	1.10
  +++ DebugFeedParserListener.java	20 Jan 2005 09:00:37 -0000	1.11
  @@ -195,7 +195,15 @@
   
       }
   
  -    
  -    
  -}
  +    public void onRelation( FeedParserState state,
  +                            String value ) {
   
  +        out.println( "onRelation: " + value );
  +        
  +    }
  +
  +    public void onRelationEnd() {
  +        out.println( "onRelationEnd" );
  +    }
  +
  +} 
  \ No newline at end of file
  
  
  
  1.8       +4 -2      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
  
  Index: AnchorParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- AnchorParser.java	19 Jan 2005 06:01:41 -0000	1.7
  +++ AnchorParser.java	20 Jan 2005 09:00:37 -0000	1.8
  @@ -35,7 +35,8 @@
                                                 Pattern.CASE_INSENSITIVE | Pattern.MULTILINE
);
   
       public static void parse( String content,
  -                              AnchorParserListener listener ) {
  +                              AnchorParserListener listener )
  +        throws AnchorParserException {
   
           parseAnchors( content, listener );
           
  @@ -48,7 +49,8 @@
        * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
        */
       public static void parseAnchors( String content,
  -                                     AnchorParserListener listener ) {
  +                                     AnchorParserListener listener )
  +        throws AnchorParserException {
   
           int index = 0;
   
  
  
  
  1.3       +2 -1      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParserListener.java
  
  Index: AnchorParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParserListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- AnchorParserListener.java	15 Apr 2004 16:57:20 -0000	1.2
  +++ AnchorParserListener.java	20 Jan 2005 09:00:37 -0000	1.3
  @@ -42,6 +42,7 @@
        *
        * @author <a href="mailto:burton@peerfear.org">Kevin Burton</a>
        */
  -    public boolean onAnchor( String href, String rel, String title );
  +    public boolean onAnchor( String href, String rel, String title )
  +        throws AnchorParserException;
   
   }
  
  
  
  1.18      +2 -2      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java
  
  Index: ProbeLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -r1.17 -r1.18
  --- ProbeLocator.java	18 Jan 2005 19:39:36 -0000	1.17
  +++ ProbeLocator.java	20 Jan 2005 09:00:37 -0000	1.18
  @@ -79,9 +79,9 @@
           // fail-fast if we already have some results and if we determine that
           // we can trust the results (TextAmerica has invalid autodiscovery,
           // for example)
  -        if ( list.size() > 0 && blogService.hasValidAutodiscovery() )
  +        if ( list.size() > 0 && blogService.hasValidAutoDiscovery() )
               return list;
  -        else if ( blogService.hasValidAutodiscovery() == false ) {
  +        else if ( blogService.hasValidAutoDiscovery() == false ) {
               // clear out the list so far since we can't trust the results
               list.clear();
           }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


Mime
View raw message