lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r895339 [1/2] - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ con...
Date Sun, 03 Jan 2010 08:48:18 GMT
Author: rmuir
Date: Sun Jan  3 08:48:17 2010
New Revision: 895339

URL: http://svn.apache.org/viewvc?rev=895339&view=rev
Log:
LUCENE-2034: Refactor analyzer reuse and stopword handling

Added:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/index/wordliststopwords.txt   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Jan  3 08:48:17 2010
@@ -105,6 +105,12 @@
   backwards compatibility. If Version < 3.1 is passed to the constructor, 
   LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)  
 
+* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
+  that makes it easier to reuse TokenStreams correctly. This issue also added
+  StopwordAnalyzerBase, which improves consistency of all Analyzers that use
+  stopwords, and implement many analyzers in contrib with it.  
+  (Simon Willnauer via Robert Muir)
+  
 Optimizations
 
 * LUCENE-2086: When resolving deleted terms, do so in term sort order

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -19,17 +19,15 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Hashtable;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -52,7 +50,7 @@
  * </ul>
  * 
  */
-public final class ArabicAnalyzer extends Analyzer {
+public final class ArabicAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * File containing default Arabic stopwords.
@@ -63,20 +61,17 @@
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set<?> stoptable;
-  /**
    * The comment character in the stopwords file.  All lines prefixed with this will be ignored
    * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly  
    */
+  // TODO make this private 
   public static final String STOPWORDS_COMMENT = "#";
   
   /**
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
    */
-  public static Set<String> getDefaultStopSet(){
+  public static Set<?> getDefaultStopSet(){
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
   
@@ -85,34 +80,19 @@
    * accesses the static final set the first time.;
    */
   private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
+    static final Set<?> DEFAULT_STOP_SET;
 
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+        DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
         throw new RuntimeException("Unable to load default stopword set");
       }
     }
-
-    static Set<String> loadDefaultStopWordSet() throws IOException {
-      InputStream stream = ArabicAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        stream.close();
-      }
-    }
   }
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
@@ -129,8 +109,7 @@
    *          a stopword set
    */
   public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -159,54 +138,21 @@
 
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+   * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
    * 			{@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new LowerCaseFilter(matchVersion, result);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new ArabicLetterTokenizer(reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
     // the order here is important: the stopword list is not normalized!
-    result = new StopFilter( matchVersion, result, stoptable );
-    result = new ArabicNormalizationFilter( result );
-    result = new ArabicStemFilter( result );
-
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   *
-   * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
-   *            and {@link ArabicStemFilter}.
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      // the order here is important: the stopword list is not normalized!
-      streams.result = new StopFilter( matchVersion, streams.result, stoptable);
-      streams.result = new ArabicNormalizationFilter(streams.result);
-      streams.result = new ArabicStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    result = new StopFilter( matchVersion, result, stopwords);
+    result = new ArabicNormalizationFilter(result);
+    return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
 }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,17 +17,16 @@
  * limitations under the License.
  */
 
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -43,7 +42,7 @@
  * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
  * <p>
  */
-public final class BulgarianAnalyzer extends Analyzer {
+public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
   
   /**
    * File containing default Bulgarian stopwords.
@@ -55,13 +54,11 @@
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
   
   /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set<?> stoptable;
-  /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
+   * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
    */
+  //TODO make this private
   public static final String STOPWORDS_COMMENT = "#";
   
   /**
@@ -69,7 +66,7 @@
    * 
    * @return an unmodifiable instance of the default stop-words set.
    */
-  public static Set<String> getDefaultStopSet() {
+  public static Set<?> getDefaultStopSet() {
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
   
@@ -78,35 +75,19 @@
    * class accesses the static final set the first time.;
    */
   private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
+    static final Set<?> DEFAULT_STOP_SET;
     
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
-      } catch (Exception ex) {
+        DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
-        throw new RuntimeException("Unable to load default stopword set", ex);
-      }
-    }
-    
-    static Set<String> loadDefaultStopWordSet() throws IOException {
-      final InputStream stream = BulgarianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        if(stream != null)
-          stream.close();
+        throw new RuntimeException("Unable to load default stopword set");
       }
     }
   }
-  
-  private final Version matchVersion;
-  
+   
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
@@ -119,58 +100,24 @@
    * Builds an analyzer with the given stop words.
    */
   public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
-    super();
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
-        stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
   
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
    * {@link Reader}.
    * 
-   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link BulgarianStemFilter}.
    */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
+  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stoptable);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new BulgarianStemFilter(result);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   * 
-   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new BulgarianStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, result);
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -21,19 +21,21 @@
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
-import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
@@ -49,7 +51,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class BrazilianAnalyzer extends Analyzer {
+public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
 
 	/**
 	 * List of typical Brazilian Portuguese stopwords.
@@ -91,19 +93,13 @@
             Arrays.asList(BRAZILIAN_STOP_WORDS), false));
   }
 
-	/**
-	 * Contains the stopwords used with the {@link StopFilter}.
-	 */
-	private final Set<?> stoptable;
-	
+
 	/**
 	 * Contains words that should be indexed but not stemmed.
 	 */
 	// TODO make this private in 3.1
 	private Set<?> excltable = Collections.emptySet();
 	
-  private final Version matchVersion;
-
 	/**
 	 * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
 	 */
@@ -120,8 +116,7 @@
    *          a stopword set
    */
   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+     super(matchVersion, stopwords);
   }
 
   /**
@@ -188,53 +183,22 @@
 		excltable = WordlistLoader.getWordSet( exclusionlist );
 		setPreviousTokenStream(null); // force a new stemmer to be created
 	}
-
-	/**
-	 * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-	 *
-	 * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-	 * 			{@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
-	 *          {@link BrazilianStemFilter}.
-	 */
-	@Override
-	public final TokenStream tokenStream(String fieldName, Reader reader) {
-                TokenStream result = new StandardTokenizer( matchVersion, reader );
-		result = new LowerCaseFilter( matchVersion, result );
-		result = new StandardFilter( result );
-		result = new StopFilter( matchVersion, result, stoptable );
-		result = new BrazilianStemFilter( result, excltable );
-		return result;
-	}
-	
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
-    
-    /**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-     * in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *          {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
-     *          {@link BrazilianStemFilter}.
-     */
-    @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new LowerCaseFilter(matchVersion, streams.source);
-        streams.result = new StandardFilter(streams.result);
-        streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-        streams.result = new BrazilianStemFilter(streams.result, excltable);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
-    }
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+   *
+   * @return  {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
+   *      {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
+   *          {@link BrazilianStemFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    result = new StandardFilter(result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new BrazilianStemFilter(result,
+        excltable));
+  }
 }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -19,12 +19,12 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Set;
@@ -35,7 +35,7 @@
  * filters with {@link StopFilter}
  *
  */
-public final class CJKAnalyzer extends Analyzer {
+public final class CJKAnalyzer extends StopwordAnalyzerBase {
   //~ Static fields/initializers ---------------------------------------------
 
   /**
@@ -71,11 +71,6 @@
         .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
             false));
   }
-  /**
-   * stop word list
-   */
-  private final Set<?> stopTable;
-  private final Version matchVersion;
 
   //~ Constructors -----------------------------------------------------------
 
@@ -95,8 +90,7 @@
    *          a stopword set
    */
   public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
-    stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -106,51 +100,15 @@
    * @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
    */
   public CJKAnalyzer(Version matchVersion, String... stopWords) {
-    stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
-    this.matchVersion = matchVersion;
+    super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
   }
 
   //~ Methods ----------------------------------------------------------------
 
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader    input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *    {@link StopFilter}
-   */
-  @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader    Input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *    {@link StopFilter}
-   */
   @Override
-  public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    /* tokenStream() is final, no back compat issue */
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new CJKTokenizer(reader);
-      streams.result = new StopFilter(matchVersion, streams.source, stopTable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new CJKTokenizer(reader);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Jan  3 08:48:17 2010
@@ -25,8 +25,6 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeSource.AttributeFactory;
-
 
 /**
  * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,10 +17,11 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
@@ -29,49 +30,19 @@
  *
  */
 
-public final class ChineseAnalyzer extends Analyzer {
-
-    public ChineseAnalyzer() {
-    }
-
-    /**
-    * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-    *
-    * @return  A {@link TokenStream} built from a {@link ChineseTokenizer} 
-    *   filtered with {@link ChineseFilter}.
-    */
-    @Override
-    public final TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new ChineseTokenizer(reader);
-        result = new ChineseFilter(result);
-        return result;
-    }
-    
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
+public final class ChineseAnalyzer extends ReusableAnalyzerBase {
 
-    /**
-    * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
-    * provided {@link Reader}.
-    * 
-    * @return A {@link TokenStream} built from a {@link ChineseTokenizer} 
-    *   filtered with {@link ChineseFilter}.
-    */
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link ChineseTokenizer} filtered with {@link ChineseFilter}
+   */
     @Override
-    public final TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-      /* tokenStream() is final, no back compat issue */
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new ChineseTokenizer(reader);
-        streams.result = new ChineseFilter(streams.source);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer source = new ChineseTokenizer(reader);
+      return new TokenStreamComponents(source, new ChineseFilter(source));
     }
 }
\ No newline at end of file

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
@@ -30,9 +32,9 @@
 
 import java.io.*;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.Collections;
 
 /**
  * {@link Analyzer} for Czech language.
@@ -53,7 +55,7 @@
  * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  * </ul>
  */
-public final class CzechAnalyzer extends Analyzer {
+public final class CzechAnalyzer extends ReusableAnalyzerBase {
 
   /**
 	 * List of typical stopwords.
@@ -95,10 +97,11 @@
 	      Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
 	}
 
+ 
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-	// TODO make this final in 3.1
+	// TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
 	private Set<?> stoptable;
   private final Version matchVersion;
 
@@ -168,6 +171,7 @@
      * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
      *             and {@link #CzechAnalyzer(Version, Set)} instead
      */
+    // TODO extend StopwordAnalyzerBase once this method is gone!
     public void loadStopWords( InputStream wordfile, String encoding ) {
         setPreviousTokenStream(null); // force a new stopfilter to be created
         if ( wordfile == null ) {
@@ -191,58 +195,25 @@
           stoptable = Collections.emptySet();
         }
     }
-
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    * 
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
    *         >= LUCENE_31)
    */
   @Override
-	public final TokenStream tokenStream( String fieldName, Reader reader ) {
-                TokenStream result = new StandardTokenizer( matchVersion, reader );
-		result = new StandardFilter( result );
-		result = new LowerCaseFilter( matchVersion, result );
-		result = new StopFilter( matchVersion, result, stoptable );
-		if (matchVersion.onOrAfter(Version.LUCENE_31))
-		  result = new CzechStemFilter(result);
-		return result;
-	}
-	
-	private class SavedStreams {
-	    Tokenizer source;
-	    TokenStream result;
-	};
-	
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   * 
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
-   *         >= LUCENE_31)
-   */
-	@Override
-	public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new StandardFilter(streams.source);
-        streams.result = new LowerCaseFilter(matchVersion, streams.result);
-        streams.result = new StopFilter( matchVersion, streams.result, stoptable);
-        if (matchVersion.onOrAfter(Version.LUCENE_31))
-          streams.result = new CzechStemFilter(streams.result);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
-    }
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter( matchVersion, result, stoptable);
+    if (matchVersion.onOrAfter(Version.LUCENE_31))
+      result = new CzechStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
 }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -29,13 +29,15 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 /**
@@ -51,7 +53,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class GermanAnalyzer extends Analyzer {
+public final class GermanAnalyzer extends StopwordAnalyzerBase {
   
   /**
    * List of typical german stopwords.
@@ -89,17 +91,13 @@
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-  //TODO make this final in 3.1
-  private Set<?> stopSet;
-
+ 
   /**
    * Contains words that should be indexed but not stemmed.
    */
   // TODO make this final in 3.1
   private Set<?> exclusionSet;
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words:
    * {@link #getDefaultStopSet()}.
@@ -131,9 +129,8 @@
    *          a stemming exclusion set
    */
   public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
-    this.matchVersion = matchVersion;
   }
 
   /**
@@ -187,51 +184,23 @@
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
-   *         {@link GermanStemFilter}
-   */
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter( matchVersion, result, stopSet);
-    result = new GermanStemFilter(result, exclusionSet);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
   
   /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter}, and
    *         {@link GermanStemFilter}
    */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter( matchVersion, streams.result, stopSet);
-      streams.result = new GermanStemFilter(streams.result, exclusionSet);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter( matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -19,14 +19,15 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -43,7 +44,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class GreekAnalyzer extends Analyzer
+public final class GreekAnalyzer extends StopwordAnalyzerBase
 {
     /**
      * List of typical Greek stopwords.
@@ -73,13 +74,6 @@
           Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
     }
 
-    /**
-     * Contains the stopwords used with the {@link StopFilter}.
-     */
-    private final Set<?> stopSet;
-
-    private final Version matchVersion;
-
     public GreekAnalyzer(Version matchVersion) {
       this(matchVersion, DefaultSetHolder.DEFAULT_SET);
     }
@@ -93,8 +87,7 @@
      *          a stopword set
      */
     public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
-      stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-      this.matchVersion = matchVersion;
+      super(matchVersion, stopwords);
     }
 
     /**
@@ -115,47 +108,20 @@
     {
       this(matchVersion, stopwords.keySet());
     }
-
-    /**
-     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
-     */
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader)
-    {
-        TokenStream result = new StandardTokenizer(matchVersion, reader);
-        result = new GreekLowerCaseFilter(result);
-        result = new StopFilter(matchVersion, result, stopSet);
-        return result;
-    }
-    
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
-    
-    /**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-     * in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
-     */
+  
+   /**
+    * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+    * provided {@link Reader}.
+    * 
+    * @return {@link TokenStreamComponents} built from a
+    *         {@link StandardTokenizer} filtered with
+    *         {@link GreekLowerCaseFilter} and {@link StopFilter}
+    */
     @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-      throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new GreekLowerCaseFilter(streams.source);
-        streams.result = new StopFilter(matchVersion, streams.result, stopSet);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      final TokenStream result = new GreekLowerCaseFilter(source);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
     }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -19,17 +19,15 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Hashtable;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -45,7 +43,7 @@
  * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
  * </p>
  */
-public final class PersianAnalyzer extends Analyzer {
+public final class PersianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * File containing default Persian stopwords.
@@ -58,11 +56,6 @@
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set<?> stoptable;
-
-  /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
    */
@@ -85,30 +78,15 @@
 
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+        DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
         throw new RuntimeException("Unable to load default stopword set");
       }
     }
-
-    static Set<String> loadDefaultStopWordSet() throws IOException {
-      InputStream stream = PersianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        stream.close();
-      }
-    }
   }
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
@@ -126,8 +104,7 @@
    *          a stopword set
    */
   public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -156,18 +133,19 @@
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    * 
-   * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
    *         filtered with {@link LowerCaseFilter}, 
    *         {@link ArabicNormalizationFilter},
    *         {@link PersianNormalizationFilter} and Persian Stop words
    */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new ArabicLetterTokenizer(reader);
-    result = new LowerCaseFilter(matchVersion, result);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new ArabicLetterTokenizer(reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
@@ -175,44 +153,6 @@
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    result = new StopFilter(matchVersion, result, stoptable);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  }
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   * 
-   * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
-   *         filtered with {@link LowerCaseFilter}, 
-   *         {@link ArabicNormalizationFilter},
-   *         {@link PersianNormalizationFilter} and Persian Stop words
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new ArabicNormalizationFilter(streams.result);
-      /* additional persian-specific normalization */
-      streams.result = new PersianNormalizationFilter(streams.result);
-      /*
-       * the order here is important: the stopword list is normalized with the
-       * above!
-       */
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -20,7 +20,9 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -59,7 +61,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class FrenchAnalyzer extends Analyzer {
+public final class FrenchAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Extended list of typical French stopwords.
@@ -92,17 +94,11 @@
   };
 
   /**
-   * Contains the stopwords used with the {@link StopFilter}.
-   */
-  private final Set<?> stoptable;
-  /**
    * Contains words that should be indexed but not stemmed.
    */
   //TODO make this final in 3.0
   private Set<?> excltable = Collections.<Object>emptySet();
 
-  private final Version matchVersion;
-  
   /**
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
@@ -148,9 +144,7 @@
    */
   public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
       Set<?> stemExclutionSet) {
-    this.matchVersion = matchVersion;
-    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
         .copy(matchVersion, stemExclutionSet));
   }
@@ -202,54 +196,22 @@
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} 
    *         filtered with {@link StandardFilter}, {@link StopFilter}, 
    *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
    */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new StopFilter(matchVersion, result, stoptable);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new FrenchStemFilter(result, excltable);
     // Convert to lowercase after stemming!
-    result = new LowerCaseFilter(matchVersion, result);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the 
-   * text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
-   *         filtered with {@link StandardFilter}, {@link StopFilter}, 
-   *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new FrenchStemFilter(streams.result, excltable);
-      // Convert to lowercase after stemming!
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
   }
 }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,7 +17,6 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -26,7 +25,9 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
@@ -39,7 +40,7 @@
  * A default set of stopwords is used unless an alternative list is specified.
  * </p>
  */
-public final class RussianAnalyzer extends Analyzer
+public final class RussianAnalyzer extends StopwordAnalyzerBase
 {
     /**
      * List of typical Russian stopwords.
@@ -63,13 +64,6 @@
               Arrays.asList(RUSSIAN_STOP_WORDS), false));
     }
 
-    /**
-     * Contains the stopwords used with the StopFilter.
-     */
-    private final Set<?> stopSet;
-
-    private final Version matchVersion;
-
     public RussianAnalyzer(Version matchVersion) {
       this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
     }
@@ -91,8 +85,7 @@
      *          a stopword set
      */
     public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
-      stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-      this.matchVersion = matchVersion;
+      super(matchVersion, stopwords);
     }
    
     /**
@@ -106,52 +99,21 @@
     }
 
     /**
-     * Creates a {@link TokenStream} which tokenizes all the text in the 
+     * Creates {@link TokenStreamComponents} used to tokenize all the text in the 
      * provided {@link Reader}.
      *
-     * @return  A {@link TokenStream} built from a 
-     *   {@link RussianLetterTokenizer} filtered with 
-     *   {@link LowerCaseFilter}, {@link StopFilter}, 
-     *   and {@link RussianStemFilter}
-     */
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader)
-    {
-        TokenStream result = new RussianLetterTokenizer(reader);
-        result = new LowerCaseFilter(matchVersion, result);
-        result = new StopFilter(matchVersion, result, stopSet);
-        result = new RussianStemFilter(result);
-        return result;
-    }
-    
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
-    
-    /**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-     * in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a 
+     * @return {@link TokenStreamComponents} built from a 
      *   {@link RussianLetterTokenizer} filtered with 
      *   {@link LowerCaseFilter}, {@link StopFilter}, 
      *   and {@link RussianStemFilter}
      */
     @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new RussianLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stopSet);
-      streams.result = new RussianStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer source = new RussianLetterTokenizer(reader);
+      TokenStream result = new LowerCaseFilter(matchVersion, source);
+      result = new StopFilter(matchVersion, result, stopwords);
+      return new TokenStreamComponents(source, new RussianStemFilter(result));
+      
     }
-    return streams.result;
-  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -16,16 +16,18 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 /**
@@ -35,41 +37,28 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class ThaiAnalyzer extends Analyzer {
+public final class ThaiAnalyzer extends ReusableAnalyzerBase {
   private final Version matchVersion;
 
   public ThaiAnalyzer(Version matchVersion) {
     this.matchVersion = matchVersion;
   }
-  
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream ts = new StandardTokenizer(matchVersion, reader);
-    ts = new StandardFilter(ts);
-    ts = new ThaiWordFilter(ts);
-    ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-    return ts;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
+
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
+   *         {@link ThaiWordFilter}, and {@link StopFilter}
+   */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new ThaiWordFilter(streams.result);
-      streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-      streams.result.reset(); // reset the ThaiWordFilter's state
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new ThaiWordFilter(result);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion,
+        result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,10 +17,10 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.Version;
 
@@ -78,7 +78,9 @@
    * Test that custom stopwords work, and are not case-sensitive.
    */
   public void testCustomStopwords() throws Exception {
-    ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+    Set<String> set = new HashSet<String>();
+    Collections.addAll(set, "the", "and", "a");
+    ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
         "brown", "fox" });
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Sun Jan  3 08:48:17 2010
@@ -17,10 +17,12 @@
  * limitations under the License.
  */
 
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.Version;
 
 /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -17,11 +17,8 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
-
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java?rev=895339&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java Sun Jan  3 08:48:17 2010
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * An convenience subclass of Analyzer that makes it easy to implement
+ * {@link TokenStream} reuse.
+ * <p>
+ * ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse
+ * for the most common use-cases. Analyzers such as
+ * {@link PerFieldAnalyzerWrapper} that behave differently depending upon the
+ * field name need to subclass Analyzer directly instead.
+ * </p>
+ * <p>
+ * To prevent consistency problems, this class does not allow subclasses to
+ * extend {@link #reusableTokenStream(String, Reader)} or
+ * {@link #tokenStream(String, Reader)} directly. Instead, subclasses must
+ * implement {@link #createComponents(String, Reader)}.
+ * </p>
+ */
+public abstract class ReusableAnalyzerBase extends Analyzer {
+
+  /**
+   * Creates a new {@link TokenStreamComponents} instance for this analyzer.
+   * 
+   * @param fieldName
+   *          the name of the fields content passed to the
+   *          {@link TokenStreamComponents} sink as a reader
+   * @param aReader
+   *          the reader passed to the {@link Tokenizer} constructor
+   * @return the {@link TokenStreamComponents} for this analyzer.
+   */
+  protected abstract TokenStreamComponents createComponents(String fieldName,
+      Reader aReader);
+
+  /**
+   * This method uses {@link #createComponents(String, Reader)} to obtain an
+   * instance of {@link TokenStreamComponents}. It returns the sink of the
+   * components and stores the components internally. Subsequent calls to this
+   * method will reuse the previously stored components if and only if the
+   * {@link TokenStreamComponents#reset(Reader)} method returned
+   * <code>true</code>. Otherwise a new instance of
+   * {@link TokenStreamComponents} is created.
+   * 
+   * @param fieldName the name of the field the created TokenStream is used for
+   * @param reader the reader the streams source reads from
+   */
+  @Override
+  public final TokenStream reusableTokenStream(final String fieldName,
+      final Reader reader) throws IOException {
+    TokenStreamComponents streamChain = (TokenStreamComponents)
+    getPreviousTokenStream();
+    if (streamChain == null || !streamChain.reset(reader)) {
+      streamChain = createComponents(fieldName, reader);
+      setPreviousTokenStream(streamChain);
+    }
+    return streamChain.getTokenStream();
+  }
+
+  /**
+   * This method uses {@link #createComponents(String, Reader)} to obtain an
+   * instance of {@link TokenStreamComponents} and returns the sink of the
+   * components. Each calls to this method will create a new instance of
+   * {@link TokenStreamComponents}. Created {@link TokenStream} instances are 
+   * never reused.
+   * 
+   * @param fieldName the name of the field the created TokenStream is used for
+   * @param reader the reader the streams source reads from
+   */
+  @Override
+  public final TokenStream tokenStream(final String fieldName,
+      final Reader reader) {
+    return createComponents(fieldName, reader).getTokenStream();
+  }
+  
+  /**
+   * This class encapsulates the outer components of a token stream. It provides
+   * access to the source ({@link Tokenizer}) and the outer end (sink), an
+   * instance of {@link TokenFilter} which also serves as the
+   * {@link TokenStream} returned by
+   * {@link Analyzer#tokenStream(String, Reader)} and
+   * {@link Analyzer#reusableTokenStream(String, Reader)}.
+   */
+  public static class TokenStreamComponents {
+    final Tokenizer source;
+    final TokenStream sink;
+
+    /**
+     * Creates a new {@link TokenStreamComponents} instance.
+     * 
+     * @param source
+     *          the analyzer's tokenizer
+     * @param result
+     *          the analyzer's resulting token stream
+     */
+    public TokenStreamComponents(final Tokenizer source,
+        final TokenStream result) {
+      this.source = source;
+      this.sink = result;
+    }
+    
+    /**
+     * Creates a new {@link TokenStreamComponents} instance.
+     * 
+     * @param source
+     *          the analyzer's tokenizer
+     */
+    public TokenStreamComponents(final Tokenizer source) {
+      this.source = source;
+      this.sink = source;
+    }
+
+    /**
+     * Resets the encapsulated components with the given reader. This method by
+     * default returns <code>true</code> indicating that the components have
+     * been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might use
+     * their own {@link TokenStreamComponents} returning <code>false</code> if
+     * the components cannot be reset.
+     * 
+     * @param reader
+     *          a reader to reset the source component
+     * @return <code>true</code> if the components were reset, otherwise
+     *         <code>false</code>
+     * @throws IOException
+     *           if the component's reset method throws an {@link IOException}
+     */
+    protected boolean reset(final Reader reader) throws IOException {
+      source.reset(reader);
+      if(sink != source)
+        sink.reset(); // only reset if the sink reference is different from source
+      return true;
+    }
+
+    /**
+     * Returns the sink {@link TokenStream}
+     * 
+     * @return the sink {@link TokenStream}
+     */
+    protected TokenStream getTokenStream() {
+      return sink;
+    }
+
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -18,25 +18,15 @@
  */
 
 import java.io.Reader;
-import java.io.IOException;
 
 /** An {@link Analyzer} that filters {@link LetterTokenizer} 
  *  with {@link LowerCaseFilter} */
 
-public final class SimpleAnalyzer extends Analyzer {
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new LowerCaseTokenizer(reader);
-  }
+public final class SimpleAnalyzer extends ReusableAnalyzerBase {
 
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
-    if (tokenizer == null) {
-      tokenizer = new LowerCaseTokenizer(reader);
-      setPreviousTokenStream(tokenizer);
-    } else
-      tokenizer.reset(reader);
-    return tokenizer;
+  protected TokenStreamComponents createComponents(final String fieldName,
+      final Reader reader) {
+    return new TokenStreamComponents(new LowerCaseTokenizer(reader));
   }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -24,6 +24,7 @@
 import java.util.Set;
 import java.util.List;
 
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.util.Version;
 
 /** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
@@ -38,9 +39,7 @@
  * </ul>
 */
 
-public final class StopAnalyzer extends Analyzer {
-  private final Set<?> stopWords;
-  private final Version matchVersion;
+public final class StopAnalyzer extends StopwordAnalyzerBase {
   
   /** An unmodifiable set containing some common English words that are not usually useful
   for searching.*/
@@ -65,16 +64,14 @@
    * @param matchVersion See <a href="#version">above</a>
    */
   public StopAnalyzer(Version matchVersion) {
-    stopWords = ENGLISH_STOP_WORDS_SET;
-    this.matchVersion = matchVersion;
+    this(matchVersion, ENGLISH_STOP_WORDS_SET);
   }
 
   /** Builds an analyzer with the stop words from the given set.
    * @param matchVersion See <a href="#version">above</a>
    * @param stopWords Set of stop words */
   public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
-    this.stopWords = stopWords;
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopWords);
   }
 
   /** Builds an analyzer with the stop words from the given file.
@@ -82,8 +79,7 @@
    * @param matchVersion See <a href="#version">above</a>
    * @param stopwordsFile File to load stop words from */
   public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
-    stopWords = WordlistLoader.getWordSet(stopwordsFile);
-    this.matchVersion = matchVersion;
+    this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
   }
 
   /** Builds an analyzer with the stop words from the given reader.
@@ -91,34 +87,21 @@
    * @param matchVersion See <a href="#version">above</a>
    * @param stopwords Reader to load stop words from */
   public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    stopWords = WordlistLoader.getWordSet(stopwords);
-    this.matchVersion = matchVersion;
+    this(matchVersion, WordlistLoader.getWordSet(stopwords));
   }
 
-  /** Filters LowerCaseTokenizer with StopFilter. */
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(matchVersion,
-        new LowerCaseTokenizer(reader), stopWords);
-  }
-
-  /** Filters LowerCaseTokenizer with StopFilter. */
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+   *
+   * @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with
+   *         {@link StopFilter}
+   */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new LowerCaseTokenizer(reader);
-      streams.result = new StopFilter(matchVersion,
-          streams.source, stopWords);
-      setPreviousTokenStream(streams);
-    } else
-      streams.source.reset(reader);
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new LowerCaseTokenizer(reader);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion,
+          source, stopwords));
   }
 }
 

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java?rev=895339&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java Sun Jan  3 08:48:17 2010
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets. 
+ * 
+ */
+public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
+
+  /**
+   * An immutable stopword set
+   */
+  protected final CharArraySet stopwords;
+
+  protected final Version matchVersion;
+
+  /**
+   * Returns the analyzer's stopword set or an empty set if the analyzer has no
+   * stopwords
+   * 
+   * @return the analyzer's stopword set or an empty set if the analyzer has no
+   *         stopwords
+   */
+  public Set<?> getStopwordSet() {
+    return stopwords;
+  }
+
+  /**
+   * Creates a new instance initialized with the given stopword set
+   * 
+   * @param version
+   *          the Lucene version for cross version compatibility
+   * @param stopwords
+   *          the analyzer's stopword set
+   */
+  protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
+    /*
+     * no need to call
+     * setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both
+     * tokenStream methods are final in this class.
+     */
+    matchVersion = version;
+    // analyzers should use char array set for stopwords!
+    this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+        .unmodifiableSet(CharArraySet.copy(version, stopwords));
+  }
+
+  /**
+   * Creates a new Analyzer with an empty stopword set
+   * 
+   * @param version
+   *          the Lucene version for cross version compatibility
+   */
+  protected StopwordAnalyzerBase(final Version version) {
+    this(version, null);
+  }
+
+  /**
+   * Creates a CharArraySet from a file resource associated with a class. (See
+   * {@link Class#getResourceAsStream(String)}).
+   * 
+   * @param ignoreCase
+   *          <code>true</code> if the set should ignore the case of the
+   *          stopwords, otherwise <code>false</code>
+   * @param aClass
+   *          a class that is associated with the given stopwordResource
+   * @param resource
+   *          name of the resource file associated with the given class
+   * @param comment
+   *          comment string to ignore in the stopword file
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         file
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+      final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
+      final String comment) throws IOException {
+    final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
+        comment);
+    final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
+    set.addAll(wordSet);
+    return set;
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java Sun Jan  3 08:48:17 2010
@@ -18,24 +18,14 @@
  */
 
 import java.io.Reader;
-import java.io.IOException;
 
 /** An Analyzer that uses {@link WhitespaceTokenizer}. */
 
-public final class WhitespaceAnalyzer extends Analyzer {
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new WhitespaceTokenizer(reader);
-  }
+public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
 
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
-    if (tokenizer == null) {
-      tokenizer = new WhitespaceTokenizer(reader);
-      setPreviousTokenStream(tokenizer);
-    } else
-      tokenizer.reset(reader);
-    return tokenizer;
+  protected TokenStreamComponents createComponents(final String fieldName,
+      final Reader reader) {
+    return new TokenStreamComponents(new WhitespaceTokenizer(reader));
   }
 }



Mime
View raw message