lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ehatc...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/standard StandardAnalyzer.java
Date Fri, 12 Mar 2004 09:43:48 GMT
ehatcher    2004/03/12 01:43:48

  Modified:    src/java/org/apache/lucene/analysis StopAnalyzer.java
                        StopFilter.java
               src/java/org/apache/lucene/analysis/de GermanAnalyzer.java
                        GermanStemFilter.java WordlistLoader.java
               src/java/org/apache/lucene/analysis/ru RussianAnalyzer.java
               src/java/org/apache/lucene/analysis/standard
                        StandardAnalyzer.java
  Log:
  convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods.
  
  Revision  Changes    Path
  1.3       +5 -4      jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java
  
  Index: StopAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- StopAnalyzer.java	9 Dec 2002 19:02:20 -0000	1.2
  +++ StopAnalyzer.java	12 Mar 2004 09:43:48 -0000	1.3
  @@ -56,11 +56,12 @@
   
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.Set;
   
   /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
   
   public final class StopAnalyzer extends Analyzer {
  -  private Hashtable stopTable;
  +  private Set stopWords;
   
     /** An array containing some common English words that are not usually useful
       for searching. */
  @@ -74,17 +75,17 @@
   
     /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
     public StopAnalyzer() {
  -    stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
  +    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
     }
   
     /** Builds an analyzer which removes words in the provided array. */
     public StopAnalyzer(String[] stopWords) {
  -    stopTable = StopFilter.makeStopTable(stopWords);
  +    this.stopWords = StopFilter.makeStopSet(stopWords);
     }
   
     /** Filters LowerCaseTokenizer with StopFilter. */
     public TokenStream tokenStream(String fieldName, Reader reader) {
  -    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
  +    return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
     }
   }
   
  
  
  
  1.10      +9 -4      jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java
  
  Index: StopFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- StopFilter.java	10 Mar 2004 23:17:37 -0000	1.9
  +++ StopFilter.java	12 Mar 2004 09:43:48 -0000	1.10
  @@ -57,6 +57,7 @@
   import java.io.IOException;
   import java.util.HashSet;
   import java.util.Hashtable;
  +import java.util.Set;
   
   /**
    * Removes stop words from a token stream.
  @@ -64,7 +65,7 @@
   
   public final class StopFilter extends TokenFilter {
   
  -  private HashSet stopWords;
  +  private Set stopWords;
   
     /**
      * Constructs a filter which removes words from the input
  @@ -79,7 +80,7 @@
      * Constructs a filter which removes words from the input
      * TokenStream that are named in the Hashtable.
      *
  -   * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
  +   * @deprecated Use {@link #StopFilter(TokenStream, Set)} instead
      */
     public StopFilter(TokenStream in, Hashtable stopTable) {
       super(in);
  @@ -89,8 +90,12 @@
     /**
      * Constructs a filter which removes words from the input
      * TokenStream that are named in the Set.
  +   * It is crucial that an efficient Set implementation is used
  +   * for maximum performance.
  +   *
  +   * @see #makeStopSet(java.lang.String[])
      */
  -  public StopFilter(TokenStream in, HashSet stopWords) {
  +  public StopFilter(TokenStream in, Set stopWords) {
       super(in);
       this.stopWords = stopWords;
     }
  @@ -116,7 +121,7 @@
      * This permits this stopWords construction to be cached once when
      * an Analyzer is constructed.
      */
  -  public static final HashSet makeStopSet(String[] stopWords) {
  +  public static final Set makeStopSet(String[] stopWords) {
       HashSet stopTable = new HashSet(stopWords.length);
       for (int i = 0; i < stopWords.length; i++)
         stopTable.add(stopWords[i]);
  
  
  
  1.8       +14 -12    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- GermanAnalyzer.java	9 Oct 2003 00:08:52 -0000	1.7
  +++ GermanAnalyzer.java	12 Mar 2004 09:43:48 -0000	1.8
  @@ -62,6 +62,8 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.Set;
  +import java.util.HashSet;
   
   /**
    * Analyzer for German language. Supports an external list of stopwords (words that
  @@ -96,19 +98,19 @@
       /**
        * Contains the stopwords used with the StopFilter.
        */
  -    private Hashtable stoptable = new Hashtable();
  +    private Set stopSet = new HashSet();
   
       /**
        * Contains words that should be indexed but not stemmed.
        */
  -    private Hashtable excltable = new Hashtable();
  +    private Set exclusionSet = new HashSet();
   
       /**
        * Builds an analyzer.
        */
       public GermanAnalyzer()
       {
  -	stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
  +	stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
       }
   
       /**
  @@ -116,7 +118,7 @@
        */
       public GermanAnalyzer( String[] stopwords )
       {
  -	stoptable = StopFilter.makeStopTable( stopwords );
  +	stopSet = StopFilter.makeStopSet( stopwords );
       }
   
       /**
  @@ -124,7 +126,7 @@
        */
       public GermanAnalyzer( Hashtable stopwords )
       {
  -	stoptable = stopwords;
  +	stopSet = new HashSet(stopwords.keySet());
       }
   
       /**
  @@ -132,7 +134,7 @@
        */
       public GermanAnalyzer( File stopwords )
       {
  -	stoptable = WordlistLoader.getWordtable( stopwords );
  +	stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
       }
   
       /**
  @@ -140,7 +142,7 @@
        */
       public void setStemExclusionTable( String[] exclusionlist )
       {
  -	excltable = StopFilter.makeStopTable( exclusionlist );
  +	exclusionSet = StopFilter.makeStopSet( exclusionlist );
       }
   
       /**
  @@ -148,7 +150,7 @@
        */
       public void setStemExclusionTable( Hashtable exclusionlist )
       {
  -	excltable = exclusionlist;
  +	exclusionSet = new HashSet(exclusionlist.keySet());
       }
   
       /**
  @@ -156,7 +158,7 @@
        */
       public void setStemExclusionTable( File exclusionlist )
       {
  -	excltable = WordlistLoader.getWordtable( exclusionlist );
  +	exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
       }
   
       /**
  @@ -170,8 +172,8 @@
   	TokenStream result = new StandardTokenizer( reader );
   	result = new StandardFilter( result );
     // shouldn't there be a lowercaser before stop word filtering?
  -  result = new StopFilter( result, stoptable );
  -	result = new GermanStemFilter( result, excltable );
  +  result = new StopFilter( result, stopSet );
  +	result = new GermanStemFilter( result, exclusionSet );
   	return result;
       }
   }
  
  
  
  1.6       +28 -6     jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- GermanStemFilter.java	9 Dec 2002 19:02:21 -0000	1.5
  +++ GermanStemFilter.java	12 Mar 2004 09:43:48 -0000	1.6
  @@ -59,6 +59,8 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.Set;
  +import java.util.HashSet;
   
   /**
    * A filter that stems German words. It supports a table of words that should
  @@ -75,7 +77,7 @@
        */
       private Token token = null;
       private GermanStemmer stemmer = null;
  -    private Hashtable exclusions = null;
  +    private Set exclusionSet = null;
       
       public GermanStemFilter( TokenStream in )
       {
  @@ -85,13 +87,24 @@
       
       /**
        * Builds a GermanStemFilter that uses an exclusiontable.
  +     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
        */
       public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
       {
   	this( in );
  -	exclusions = exclusiontable;
  +	exclusionSet = new HashSet(exclusiontable.keySet());
  +
       }
  -    
  +
  +    /**
  +     * Builds a GermanStemFilter that uses an exclusiontable.
  +     */
  +    public GermanStemFilter( TokenStream in, Set exclusionSet )
  +    {
  +	this( in );
  +	this.exclusionSet = exclusionSet;
  +    }
  +
       /**
        * @return  Returns the next token in the stream, or null at EOS
        */
  @@ -102,7 +115,7 @@
   	    return null;
   	}
   	// Check the exclusiontable
  -	else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
  +	else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
   	    return token;
   	}
   	else {
  @@ -128,9 +141,18 @@
   
       /**
        * Set an alternative exclusion list for this filter.
  +     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
        */
       public void setExclusionTable( Hashtable exclusiontable )
       {
  -	exclusions = exclusiontable;
  +	exclusionSet = new HashSet(exclusiontable.keySet());
  +    }
  +
  +    /**
  +     * Set an alternative exclusion list for this filter.
  +     */
  +    public void setExclusionSet( Set exclusionSet )
  +    {
  +	this.exclusionSet = exclusionSet;
       }
   }
  
  
  
  1.6       +4 -1      jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- WordlistLoader.java	10 Mar 2004 00:18:02 -0000	1.5
  +++ WordlistLoader.java	12 Mar 2004 09:43:48 -0000	1.6
  @@ -67,6 +67,8 @@
    *
    * @author    Gerhard Schwarz
    * @version   $Id$
  + *
  + * @todo refactor to convert to Sets instead of Hashtable
    */
   public class WordlistLoader {
     /**
  @@ -92,6 +94,7 @@
   
     /**
      * @param wordfile  File containing the wordlist
  +   * @todo Create a Set version of this method
      */
     public static Hashtable getWordtable(File wordfile) {
       if (wordfile == null) {
  
  
  
  1.6       +10 -7     jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
  
  Index: RussianAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- RussianAnalyzer.java	25 Jan 2004 14:18:12 -0000	1.5
  +++ RussianAnalyzer.java	12 Mar 2004 09:43:48 -0000	1.6
  @@ -60,6 +60,8 @@
   
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.Set;
  +import java.util.HashSet;
   
   /**
    * Analyzer for Russian language. Supports an external list of stopwords (words that
  @@ -215,7 +217,7 @@
       /**
        * Contains the stopwords used with the StopFilter.
        */
  -    private Hashtable stoptable = new Hashtable();
  +    private Set stopSet = new HashSet();
   
       /**
        * Charset for Russian letters.
  @@ -227,7 +229,7 @@
   
       public RussianAnalyzer() {
           charset = RussianCharsets.UnicodeRussian;
  -        stoptable = StopFilter.makeStopTable(
  +        stopSet = StopFilter.makeStopSet(
                       makeStopWords(RussianCharsets.UnicodeRussian));
       }
   
  @@ -237,7 +239,7 @@
       public RussianAnalyzer(char[] charset)
       {
           this.charset = charset;
  -        stoptable = StopFilter.makeStopTable(makeStopWords(charset));
  +        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
       }
   
       /**
  @@ -246,7 +248,7 @@
       public RussianAnalyzer(char[] charset, String[] stopwords)
       {
           this.charset = charset;
  -        stoptable = StopFilter.makeStopTable(stopwords);
  +        stopSet = StopFilter.makeStopSet(stopwords);
       }
   
       // Takes russian stop words and translates them to a String array, using
  @@ -270,11 +272,12 @@
   
       /**
        * Builds an analyzer with the given stop words.
  +     * @todo create a Set version of this ctor
        */
       public RussianAnalyzer(char[] charset, Hashtable stopwords)
       {
           this.charset = charset;
  -        stoptable = stopwords;
  +        stopSet = new HashSet(stopwords.keySet());
       }
   
       /**
  @@ -287,7 +290,7 @@
       {
           TokenStream result = new RussianLetterTokenizer(reader, charset);
           result = new RussianLowerCaseFilter(result, charset);
  -        result = new StopFilter(result, stoptable);
  +        result = new StopFilter(result, stopSet);
           result = new RussianStemFilter(result, charset);
           return result;
       }
  
  
  
  1.7       +5 -5      jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
  
  Index: StandardAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- StandardAnalyzer.java	10 Nov 2003 14:31:19 -0000	1.6
  +++ StandardAnalyzer.java	12 Mar 2004 09:43:48 -0000	1.7
  @@ -56,7 +56,7 @@
   
   import org.apache.lucene.analysis.*;
   import java.io.Reader;
  -import java.util.Hashtable;
  +import java.util.Set;
   
   /**
    * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
  @@ -65,7 +65,7 @@
    * @version $Id$
    */
   public class StandardAnalyzer extends Analyzer {
  -  private Hashtable stopTable;
  +  private Set stopSet;
   
     /** An array containing some common English words that are usually not
     useful for searching. */
  @@ -78,7 +78,7 @@
   
     /** Builds an analyzer with the given stop words. */
     public StandardAnalyzer(String[] stopWords) {
  -    stopTable = StopFilter.makeStopTable(stopWords);
  +    stopSet = StopFilter.makeStopSet(stopWords);
     }
   
     /** Constructs a {@link StandardTokenizer} filtered by a {@link
  @@ -87,7 +87,7 @@
       TokenStream result = new StandardTokenizer(reader);
       result = new StandardFilter(result);
       result = new LowerCaseFilter(result);
  -    result = new StopFilter(result, stopTable);
  +    result = new StopFilter(result, stopSet);
       return result;
     }
   }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message