lucene-java-commits mailing list archives

From uschind...@apache.org
Subject svn commit: r899359 [2/7] - in /lucene/java/branches/flex_1458: ./ contrib/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/java/org/apache/l...
Date Thu, 14 Jan 2010 19:05:42 GMT
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java Thu Jan 14 19:05:12 2010
@@ -46,6 +46,7 @@
    * @param onlyLongestMatch Add only the longest matching subword to the stream
    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -58,6 +59,7 @@
    * @param dictionary the word dictionary to match against
    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead 
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
     super(Version.LUCENE_30, input, dictionary);
   }
@@ -70,6 +72,7 @@
    *        lower case strings.
    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead 
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
     super(Version.LUCENE_30, input, dictionary);
   }
@@ -86,6 +89,7 @@
    * @param onlyLongestMatch Add only the longest matching subword to the stream
    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);

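For context: each constructor gaining @Deprecated above already forwards to its Version-taking replacement with a hard-coded Version.LUCENE_30, so migrating a caller is mechanical. A minimal sketch, assuming a toy dictionary and a whitespace tokenizer (both illustrative, not from this commit):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class CompoundMigrationDemo {
      public static void main(String[] args) throws Exception {
        String[] dict = { "soft", "ball" }; // hypothetical dictionary
        TokenStream in = new WhitespaceTokenizer(new StringReader("softball"));
        // before (now @Deprecated): new DictionaryCompoundWordTokenFilter(in, dict, 5, 2, 15, false)
        TokenStream ts = new DictionaryCompoundWordTokenFilter(
            Version.LUCENE_30, in, dict, 5, 2, 15, false);
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // original token first, then matched subwords
        }
      }
    }
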
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Thu Jan 14 19:05:12 2010
@@ -167,6 +167,7 @@
    * @param onlyLongestMatch Add only the longest matching subword to the stream
    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead. 
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
@@ -182,6 +183,7 @@
    * @param dictionary the word dictionary to match against
    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, String[] dictionary) {
     this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
@@ -197,6 +199,7 @@
    *        lower case strings. 
    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.        
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, Set dictionary) {
     this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
@@ -218,6 +221,7 @@
    * @param onlyLongestMatch Add only the longest matching subword to the stream
    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, Set dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
@@ -30,9 +32,9 @@
 
 import java.io.*;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.Collections;
 
 /**
  * {@link Analyzer} for Czech language.
@@ -53,13 +55,14 @@
  * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  * </ul>
  */
-public final class CzechAnalyzer extends Analyzer {
+public final class CzechAnalyzer extends ReusableAnalyzerBase {
 
   /**
 	 * List of typical stopwords.
 	 * @deprecated use {@link #getDefaultStopSet()} instead
 	 */
   // TODO make this private in 3.1
+	@Deprecated
 	public final static String[] CZECH_STOP_WORDS = {
         "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
         "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
@@ -95,10 +98,11 @@
 	      Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
 	}
 
+ 
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-	// TODO make this final in 3.1
+	// TODO once loadStopWords is gone these members should be removed too in favor of StopwordAnalyzerBase
 	private Set<?> stoptable;
   private final Version matchVersion;
 
@@ -133,6 +137,7 @@
    * @param stopwords a stopword set
    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public CzechAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet( matchVersion, stopwords ));
 	}
@@ -145,6 +150,7 @@
    * @param stopwords a stopword set
    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
     this(matchVersion, (Set<?>)stopwords);
 	}
@@ -157,6 +163,7 @@
    * @param stopwords a file containing stopwords
    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
     this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
 	}
@@ -168,6 +175,8 @@
      * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
      *             and {@link #CzechAnalyzer(Version, Set)} instead
      */
+    // TODO extend StopwordAnalyzerBase once this method is gone!
+    @Deprecated
     public void loadStopWords( InputStream wordfile, String encoding ) {
         setPreviousTokenStream(null); // force a new stopfilter to be created
         if ( wordfile == null ) {
@@ -191,58 +200,25 @@
           stoptable = Collections.emptySet();
         }
     }
-
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    * 
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
    *         >= LUCENE_31)
    */
   @Override
-	public final TokenStream tokenStream( String fieldName, Reader reader ) {
-                TokenStream result = new StandardTokenizer( matchVersion, reader );
-		result = new StandardFilter( result );
-		result = new LowerCaseFilter( matchVersion, result );
-		result = new StopFilter( matchVersion, result, stoptable );
-		if (matchVersion.onOrAfter(Version.LUCENE_31))
-		  result = new CzechStemFilter(result);
-		return result;
-	}
-	
-	private class SavedStreams {
-	    Tokenizer source;
-	    TokenStream result;
-	};
-	
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   * 
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
-   *         >= LUCENE_31)
-   */
-	@Override
-	public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new StandardFilter(streams.source);
-        streams.result = new LowerCaseFilter(matchVersion, streams.result);
-        streams.result = new StopFilter( matchVersion, streams.result, stoptable);
-        if (matchVersion.onOrAfter(Version.LUCENE_31))
-          streams.result = new CzechStemFilter(streams.result);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
-    }
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter( matchVersion, result, stoptable);
+    if (matchVersion.onOrAfter(Version.LUCENE_31))
+      result = new CzechStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
 }
 

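The CzechAnalyzer hunk above is the template for the rest of this commit: the hand-written tokenStream()/reusableTokenStream() pair and the per-analyzer SavedStreams holder collapse into a single createComponents() override, with ReusableAnalyzerBase supplying the reuse plumbing. A sketch of that inherited contract, reconstructed from the deleted per-analyzer code rather than from the base class itself (the method names on TokenStreamComponents are assumptions):

    // what ReusableAnalyzerBase is presumed to do on behalf of createComponents()
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      TokenStreamComponents comps = (TokenStreamComponents) getPreviousTokenStream();
      if (comps == null) {
        comps = createComponents(fieldName, reader); // build the chain once per thread
        setPreviousTokenStream(comps);
      } else {
        comps.reset(reader); // re-point the saved Tokenizer at the new Reader
      }
      return comps.getTokenStream();
    }

Each deleted reusableTokenStream() did exactly this by hand, which is why the analyzers can now drop their private SavedStreams classes wholesale.
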
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -29,13 +29,15 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 /**
@@ -51,13 +53,14 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class GermanAnalyzer extends Analyzer {
+public final class GermanAnalyzer extends StopwordAnalyzerBase {
   
   /**
    * List of typical german stopwords.
    * @deprecated use {@link #getDefaultStopSet()} instead
    */
   //TODO make this private in 3.1
+  @Deprecated
   public final static String[] GERMAN_STOP_WORDS = {
     "einer", "eine", "eines", "einem", "einen",
     "der", "die", "das", "dass", "daß",
@@ -89,17 +92,13 @@
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-  //TODO make this final in 3.1
-  private Set<?> stopSet;
-
+ 
   /**
    * Contains words that should be indexed but not stemmed.
    */
   // TODO make this final in 3.1
   private Set<?> exclusionSet;
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words:
    * {@link #getDefaultStopSet()}.
@@ -131,15 +130,15 @@
    *          a stemming exclusion set
    */
   public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
-    this.matchVersion = matchVersion;
   }
 
   /**
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
    */
+  @Deprecated
   public GermanAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }
@@ -148,6 +147,7 @@
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
    */
+  @Deprecated
   public GermanAnalyzer(Version matchVersion, Map<?,?> stopwords) {
     this(matchVersion, stopwords.keySet());
     
@@ -157,6 +157,7 @@
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
    */
+  @Deprecated
   public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
     this(matchVersion, WordlistLoader.getWordSet(stopwords));
   }
@@ -165,6 +166,7 @@
    * Builds an exclusionlist from an array of Strings.
    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(String[] exclusionlist) {
     exclusionSet = StopFilter.makeStopSet(matchVersion, exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -174,6 +176,7 @@
    * Builds an exclusionlist from a {@link Map}
    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(Map<?,?> exclusionlist) {
     exclusionSet = new HashSet<Object>(exclusionlist.keySet());
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -183,55 +186,28 @@
    * Builds an exclusionlist from the words contained in the given file.
    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
-   *         {@link GermanStemFilter}
-   */
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter( matchVersion, result, stopSet);
-    result = new GermanStemFilter(result, exclusionSet);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
   
   /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter}, and
    *         {@link GermanStemFilter}
    */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter( matchVersion, streams.result, stopSet);
-      streams.result = new GermanStemFilter(streams.result, exclusionSet);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter( matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
   }
 }

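GermanAnalyzer additionally moves to StopwordAnalyzerBase, so its stopSet and matchVersion fields disappear: the new createComponents() reads the protected stopwords set inherited from the base class instead. A minimal usage sketch (field name and sample text are illustrative only):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class GermanAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new GermanAnalyzer(Version.LUCENE_30);
        // backed by createComponents(); a second call on this thread reuses the chain
        TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("Die Häuser"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // "die" is a stopword; "häuser" comes out stemmed
        }
      }
    }

The same StopwordAnalyzerBase migration is applied below to the Greek, Persian, French, and Russian analyzers.
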
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -19,14 +19,15 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -43,7 +44,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class GreekAnalyzer extends Analyzer
+public final class GreekAnalyzer extends StopwordAnalyzerBase
 {
     /**
      * List of typical Greek stopwords.
@@ -73,13 +74,6 @@
           Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
     }
 
-    /**
-     * Contains the stopwords used with the {@link StopFilter}.
-     */
-    private final Set<?> stopSet;
-
-    private final Version matchVersion;
-
     public GreekAnalyzer(Version matchVersion) {
       this(matchVersion, DefaultSetHolder.DEFAULT_SET);
     }
@@ -93,8 +87,7 @@
      *          a stopword set
      */
     public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
-      stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-      this.matchVersion = matchVersion;
+      super(matchVersion, stopwords);
     }
 
     /**
@@ -102,6 +95,7 @@
      * @param stopwords Array of stopwords to use.
      * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
      */
+    @Deprecated
     public GreekAnalyzer(Version matchVersion, String... stopwords)
     {
       this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
@@ -111,51 +105,25 @@
      * Builds an analyzer with the given stop words.
      * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
      */
+    @Deprecated
     public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords)
     {
       this(matchVersion, stopwords.keySet());
     }
-
-    /**
-     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
-     */
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader)
-    {
-        TokenStream result = new StandardTokenizer(matchVersion, reader);
-        result = new GreekLowerCaseFilter(result);
-        result = new StopFilter(matchVersion, result, stopSet);
-        return result;
-    }
-    
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
-    
-    /**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-     * in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
-     */
+  
+   /**
+    * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+    * provided {@link Reader}.
+    * 
+    * @return {@link TokenStreamComponents} built from a
+    *         {@link StandardTokenizer} filtered with
+    *         {@link GreekLowerCaseFilter} and {@link StopFilter}
+    */
     @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-      throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new GreekLowerCaseFilter(streams.source);
-        streams.result = new StopFilter(matchVersion, streams.result, stopSet);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      final TokenStream result = new GreekLowerCaseFilter(source);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
     }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -19,17 +19,15 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Hashtable;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -45,7 +43,7 @@
  * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
  * </p>
  */
-public final class PersianAnalyzer extends Analyzer {
+public final class PersianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * File containing default Persian stopwords.
@@ -58,11 +56,6 @@
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set<?> stoptable;
-
-  /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
    */
@@ -85,30 +78,15 @@
 
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+        DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
         throw new RuntimeException("Unable to load default stopword set");
       }
     }
-
-    static Set<String> loadDefaultStopWordSet() throws IOException {
-      InputStream stream = PersianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        stream.close();
-      }
-    }
   }
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
@@ -126,14 +104,14 @@
    *          a stopword set
    */
   public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public PersianAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }
@@ -142,6 +120,7 @@
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public PersianAnalyzer(Version matchVersion, Hashtable<?, ?> stopwords) {
     this(matchVersion, stopwords.keySet());
   }
@@ -151,23 +130,25 @@
    * using {@link #STOPWORDS_COMMENT}
    * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
     this(matchVersion, WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT));
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    * 
-   * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
    *         filtered with {@link LowerCaseFilter}, 
    *         {@link ArabicNormalizationFilter},
    *         {@link PersianNormalizationFilter} and Persian Stop words
    */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new ArabicLetterTokenizer(reader);
-    result = new LowerCaseFilter(matchVersion, result);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new ArabicLetterTokenizer(reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
@@ -175,44 +156,6 @@
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    result = new StopFilter(matchVersion, result, stoptable);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  }
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-   * in the provided {@link Reader}.
-   * 
-   * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
-   *         filtered with {@link LowerCaseFilter}, 
-   *         {@link ArabicNormalizationFilter},
-   *         {@link PersianNormalizationFilter} and Persian Stop words
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new ArabicNormalizationFilter(streams.result);
-      /* additional persian-specific normalization */
-      streams.result = new PersianNormalizationFilter(streams.result);
-      /*
-       * the order here is important: the stopword list is normalized with the
-       * above!
-       */
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 }

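The deleted loadDefaultStopWordSet() documents exactly what the new loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT) call centralizes: open the classpath resource, decode it as UTF-8, parse it with WordlistLoader honoring the comment prefix, and close the stream. A sketch reconstructed from that deleted body (the helper's actual home and its ignore-case handling in StopwordAnalyzerBase are not shown in this diff):

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.util.Collections;
    import java.util.Set;
    import org.apache.lucene.analysis.WordlistLoader;

    final class StopwordLoading {
      // hypothetical stand-in for the centralized helper
      static Set<String> loadWordSet(Class<?> clazz, String resource, String comment) throws IOException {
        InputStream stream = clazz.getResourceAsStream(resource);
        try {
          Reader reader = new InputStreamReader(stream, "UTF-8");
          // unmodifiable because the result is exposed as the shared default set
          return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, comment));
        } finally {
          stream.close();
        }
      }
    }
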
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java Thu Jan 14 19:05:12 2010
@@ -50,6 +50,7 @@
    * @param articles a set of articles
    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
    */
+  @Deprecated
   public void setArticles(Version matchVersion, Set<?> articles) {
     this.articles = CharArraySet.unmodifiableSet(
         CharArraySet.copy(matchVersion, articles));
@@ -60,6 +61,7 @@
    * @param articles a set of articles
    * @deprecated use {@link #setArticles(Version, Set)} instead
    */
+  @Deprecated
   public void setArticles(Set<?> articles) {
     setArticles(Version.LUCENE_CURRENT, articles);
   }
@@ -74,6 +76,7 @@
    * Constructs an elision filter with standard stop words
    * @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
    */
+  @Deprecated
   protected ElisionFilter(TokenStream input) {
     this(Version.LUCENE_30, input);
   }
@@ -82,6 +85,7 @@
    * Constructs an elision filter with a Set of stop words
    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
    */
+  @Deprecated
   public ElisionFilter(TokenStream input, Set<?> articles) {
     this(Version.LUCENE_30, input, articles);
   }
@@ -103,6 +107,7 @@
    * Constructs an elision filter with an array of stop words
    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
    */
+  @Deprecated
   public ElisionFilter(TokenStream input, String[] articles) {
     this(Version.LUCENE_CURRENT, input,
         new CharArraySet(Version.LUCENE_CURRENT,

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -20,7 +20,9 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -59,13 +61,14 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class FrenchAnalyzer extends Analyzer {
+public final class FrenchAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Extended list of typical French stopwords.
    * @deprecated use {@link #getDefaultStopSet()} instead
    */
   // TODO make this private in 3.1
+  @Deprecated
   public final static String[] FRENCH_STOP_WORDS = {
     "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
     "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
@@ -92,17 +95,11 @@
   };
 
   /**
-   * Contains the stopwords used with the {@link StopFilter}.
-   */
-  private final Set<?> stoptable;
-  /**
    * Contains words that should be indexed but not stemmed.
    */
   //TODO make this final in 3.0
   private Set<?> excltable = Collections.<Object>emptySet();
 
-  private final Version matchVersion;
-  
   /**
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
@@ -148,9 +145,7 @@
    */
   public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
       Set<?> stemExclutionSet) {
-    this.matchVersion = matchVersion;
-    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
         .copy(matchVersion, stemExclutionSet));
   }
@@ -160,6 +155,7 @@
    * Builds an analyzer with the given stop words.
    * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public FrenchAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }
@@ -169,6 +165,7 @@
    * @throws IOException
    * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public FrenchAnalyzer(Version matchVersion, File stopwords) throws IOException {
     this(matchVersion, WordlistLoader.getWordSet(stopwords));
   }
@@ -177,6 +174,7 @@
    * Builds an exclusionlist from an array of Strings.
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(String... exclusionlist) {
     excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -186,6 +184,7 @@
    * Builds an exclusionlist from a Map.
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(Map<?,?> exclusionlist) {
     excltable = new HashSet<Object>(exclusionlist.keySet());
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -196,60 +195,29 @@
    * @throws IOException
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     excltable = new HashSet<Object>(WordlistLoader.getWordSet(exclusionlist));
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} 
    *         filtered with {@link StandardFilter}, {@link StopFilter}, 
    *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
    */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new StopFilter(matchVersion, result, stoptable);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new FrenchStemFilter(result, excltable);
     // Convert to lowercase after stemming!
-    result = new LowerCaseFilter(matchVersion, result);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the 
-   * text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
-   *         filtered with {@link StandardFilter}, {@link StopFilter}, 
-   *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new FrenchStemFilter(streams.result, excltable);
-      // Convert to lowercase after stemming!
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
   }
 }
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Thu Jan 14 19:05:12 2010
@@ -64,22 +64,15 @@
     }
   }
 
-  private int minGram;
-  private int maxGram;
+  private final int minGram;
+  private final int maxGram;
   private Side side;
   private char[] curTermBuffer;
   private int curTermLength;
   private int curGramSize;
   
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
-
-
-  protected EdgeNGramTokenFilter(TokenStream input) {
-    super(input);
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
-  }
+  private final TermAttribute termAtt;
+  private final OffsetAttribute offsetAtt;
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -57,6 +57,7 @@
    * List of typical Dutch stopwords.
    * @deprecated use {@link #getDefaultStopSet()} instead
    */
+  @Deprecated
   public final static String[] DUTCH_STOP_WORDS =
       {
         "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
@@ -128,6 +129,7 @@
    * @param stopwords
    * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public DutchAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }
@@ -138,6 +140,7 @@
    * @param stopwords
    * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public DutchAnalyzer(Version matchVersion, HashSet<?> stopwords) {
     this(matchVersion, (Set<?>)stopwords);
   }
@@ -148,6 +151,7 @@
    * @param stopwords
    * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
    */
+  @Deprecated
   public DutchAnalyzer(Version matchVersion, File stopwords) {
     // this is completely broken!
     try {
@@ -165,6 +169,7 @@
    * @param exclusionlist
    * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(String... exclusionlist) {
     excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -174,6 +179,7 @@
    * Builds an exclusionlist from a Hashtable.
    * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(HashSet<?> exclusionlist) {
     excltable = exclusionlist;
     setPreviousTokenStream(null); // force a new stemmer to be created
@@ -183,6 +189,7 @@
    * Builds an exclusionlist from the words contained in the given file.
    * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
    */
+  @Deprecated
   public void setStemExclusionTable(File exclusionlist) {
     try {
       excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java Thu Jan 14 19:05:12 2010
@@ -38,17 +38,10 @@
  */
 public final class DelimitedPayloadTokenFilter extends TokenFilter {
   public static final char DEFAULT_DELIMITER = '|';
-  protected char delimiter = DEFAULT_DELIMITER;
-  protected TermAttribute termAtt;
-  protected PayloadAttribute payAtt;
-  protected PayloadEncoder encoder;
-
-  /**
-   * Construct a token stream filtering the given input.
-   */
-  protected DelimitedPayloadTokenFilter(TokenStream input) {
-    this(input, DEFAULT_DELIMITER, new IdentityEncoder());
-  }
+  private final char delimiter;
+  private final TermAttribute termAtt;
+  private final PayloadAttribute payAtt;
+  private final PayloadEncoder encoder;
 
 
   public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) {
@@ -61,26 +54,19 @@
 
   @Override
   public boolean incrementToken() throws IOException {
-    boolean result = false;
     if (input.incrementToken()) {
       final char[] buffer = termAtt.termBuffer();
       final int length = termAtt.termLength();
-      //look for the delimiter
-      boolean seen = false;
       for (int i = 0; i < length; i++) {
         if (buffer[i] == delimiter) {
-          termAtt.setTermBuffer(buffer, 0, i);
           payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
-          seen = true;
-          break;//at this point, we know the whole piece, so we can exit.  If we don't see the delimiter, then the termAtt is the same
+          termAtt.setTermLength(i); // simply set a new length
+          return true;
         }
       }
-      if (seen == false) {
-        //no delimiter
-        payAtt.setPayload(null);
-      }
-      result = true;
-    }
-    return result;
+      // we have not seen the delimiter
+      payAtt.setPayload(null);
+      return true;
+    } else return false;
   }
 }

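The rewritten incrementToken() drops the result/seen flags: on the first delimiter hit it encodes everything after the delimiter as the payload, truncates the term in place with setTermLength(i), and returns immediately; if the loop finds no delimiter, the payload is cleared. A usage sketch (FloatEncoder is a sibling encoder in this package; the sample text is illustrative):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
    import org.apache.lucene.analysis.payloads.FloatEncoder;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class DelimitedPayloadDemo {
      public static void main(String[] args) throws Exception {
        // "quick|2.0" -> term "quick" with a float payload; "fox" -> null payload
        TokenStream ts = new DelimitedPayloadTokenFilter(
            new WhitespaceTokenizer(new StringReader("quick|2.0 fox")),
            DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new FloatEncoder());
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term() + " -> " + payload.getPayload());
        }
      }
    }
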
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java Thu Jan 14 19:05:12 2010
@@ -18,9 +18,9 @@
 
 import org.apache.lucene.index.Payload;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.io.UnsupportedEncodingException;
 
 
 /**
@@ -30,28 +30,30 @@
 public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
 
   protected Charset charset = Charset.forName("UTF-8");
-  protected String charsetName = "UTF-8";  //argh, stupid 1.4
+  
+  /** @deprecated This field is no longer used. Use {@link #charset} instead. */
+  @Deprecated
+  protected String charsetName = charset.name();
 
   public IdentityEncoder() {
   }
 
   public IdentityEncoder(Charset charset) {
     this.charset = charset;
+    // @deprecated, remove this in 4.0:
     charsetName = charset.name();
   }
 
 
   public Payload encode(char[] buffer, int offset, int length) {
-    //what's the most efficient way to get a byte [] from a char[] array
-    //Do we have to go through String?
-    String tmp = new String(buffer, offset, length);
-    Payload result = null;//Can we avoid allocating by knowing where using the new API?
-    try {
-      result = new Payload(tmp.getBytes(charsetName));
-    } catch (UnsupportedEncodingException e) {
-      //should never hit this, since we get the name from the Charset
+    final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length));
+    if (bb.hasArray()) {
+      return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+    } else {
+      // normally it should always have an array, but who knows?
+      final byte[] b = new byte[bb.remaining()];
+      bb.get(b);
+      return new Payload(b);
     }
-
-    return result;
   }
 }

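The new encode() avoids the char[] -> String -> byte[] round-trip and the unreachable UnsupportedEncodingException by going through java.nio: wrap the char range, let the Charset produce a ByteBuffer, and hand Payload the backing array directly when one exists. The same pattern in isolation, plain JDK with nothing Lucene-specific assumed:

    import java.nio.ByteBuffer;
    import java.nio.CharBuffer;
    import java.nio.charset.Charset;

    public class CharsetEncodeDemo {
      public static void main(String[] args) {
        char[] buffer = "some payload".toCharArray();
        // encode only the "payload" slice, with no intermediate String
        ByteBuffer bb = Charset.forName("UTF-8").encode(CharBuffer.wrap(buffer, 5, 7));
        if (bb.hasArray()) {
          // zero-copy view: array() + arrayOffset() + position() delimit the bytes,
          // which is what the new Payload(...) call above passes straight through
          System.out.println(bb.remaining() + " bytes at offset " + (bb.arrayOffset() + bb.position()));
        } else {
          byte[] b = new byte[bb.remaining()]; // rare fallback: copy out
          bb.get(b);
          System.out.println(b.length + " bytes copied");
        }
      }
    }
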
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java Thu Jan 14 19:05:12 2010
@@ -78,6 +78,7 @@
    * @deprecated use {@link #ReverseStringFilter(Version, TokenStream)} 
    *    instead. This constructor will be removed in Lucene 4.0
    */
+  @Deprecated
   public ReverseStringFilter(TokenStream in) {
     this(in, NOMARKER);
   }
@@ -95,6 +96,7 @@
    * @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)} 
    *    instead. This constructor will be removed in Lucene 4.0 
    */
+  @Deprecated
   public ReverseStringFilter(TokenStream in, char marker) {
     this(Version.LUCENE_30, in, marker);
   }
@@ -157,6 +159,7 @@
    * @deprecated use {@link #reverse(Version, String)} instead. This method 
    *    will be removed in Lucene 4.0
    */
+  @Deprecated
   public static String reverse( final String input ){
     return reverse(Version.LUCENE_30, input);
   }
@@ -180,6 +183,7 @@
    * @deprecated use {@link #reverse(Version, char[])} instead. This 
    *    method will be removed in Lucene 4.0
    */
+  @Deprecated
   public static void reverse( final char[] buffer ){
     reverse( buffer, 0, buffer.length );
   }
@@ -202,6 +206,7 @@
    * @deprecated use {@link #reverse(Version, char[], int)} instead. This 
    *    method will be removed in Lucene 4.0
    */
+  @Deprecated
   public static void reverse( final char[] buffer, final int len ){
     reverse( buffer, 0, len );
   }
@@ -229,6 +234,7 @@
    * @deprecated use {@link #reverse(Version, char[], int, int)} instead. This 
    *    method will be removed in Lucene 4.0
    */
+  @Deprecated
   public static void reverse(char[] buffer, int start, int len ) {
     reverseUnicode3(buffer, start, len);
   }
@@ -236,6 +242,7 @@
   /**
    * @deprecated Remove this when support for 3.0 indexes is no longer needed.
    */
+  @Deprecated
   private static void reverseUnicode3( char[] buffer, int start, int len ){
     if( len <= 1 ) return;
     int num = len>>1;

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -17,7 +17,6 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -26,7 +25,9 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
@@ -39,7 +40,7 @@
  * A default set of stopwords is used unless an alternative list is specified.
  * </p>
  */
-public final class RussianAnalyzer extends Analyzer
+public final class RussianAnalyzer extends StopwordAnalyzerBase
 {
     /**
      * List of typical Russian stopwords.
@@ -63,13 +64,6 @@
               Arrays.asList(RUSSIAN_STOP_WORDS), false));
     }
 
-    /**
-     * Contains the stopwords used with the StopFilter.
-     */
-    private final Set<?> stopSet;
-
-    private final Version matchVersion;
-
     public RussianAnalyzer(Version matchVersion) {
       this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
     }
@@ -78,6 +72,7 @@
      * Builds an analyzer with the given stop words.
      * @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
      */
+    @Deprecated
     public RussianAnalyzer(Version matchVersion, String... stopwords) {
       this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
     }
@@ -91,8 +86,7 @@
      *          a stopword set
      */
     public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
-      stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-      this.matchVersion = matchVersion;
+      super(matchVersion, stopwords);
     }
    
     /**
@@ -100,58 +94,28 @@
      * TODO: create a Set version of this ctor
      * @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
      */
+    @Deprecated
     public RussianAnalyzer(Version matchVersion, Map<?,?> stopwords)
     {
       this(matchVersion, stopwords.keySet());
     }
 
     /**
-     * Creates a {@link TokenStream} which tokenizes all the text in the 
+     * Creates {@link TokenStreamComponents} used to tokenize all the text in the 
      * provided {@link Reader}.
      *
-     * @return  A {@link TokenStream} built from a 
-     *   {@link RussianLetterTokenizer} filtered with 
-     *   {@link LowerCaseFilter}, {@link StopFilter}, 
-     *   and {@link RussianStemFilter}
-     */
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader)
-    {
-        TokenStream result = new RussianLetterTokenizer(reader);
-        result = new LowerCaseFilter(matchVersion, result);
-        result = new StopFilter(matchVersion, result, stopSet);
-        result = new RussianStemFilter(result);
-        return result;
-    }
-    
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    };
-    
-    /**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
-     * in the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a 
+     * @return {@link TokenStreamComponents} built from a 
      *   {@link RussianLetterTokenizer} filtered with 
      *   {@link LowerCaseFilter}, {@link StopFilter}, 
      *   and {@link RussianStemFilter}
      */
     @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new RussianLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stopSet);
-      streams.result = new RussianStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer source = new RussianLetterTokenizer(reader);
+      TokenStream result = new LowerCaseFilter(matchVersion, source);
+      result = new StopFilter(matchVersion, result, stopwords);
+      return new TokenStreamComponents(source, new RussianStemFilter(result));
     }
-    return streams.result;
-  }
 }

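The hand-rolled SavedStreams reuse removed above is exactly what StopwordAnalyzerBase now provides: the base class calls createComponents once per thread and resets the saved Tokenizer against each new Reader. A minimal consumption sketch (the field name and sample text are illustrative, not part of this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class RussianAnalyzerUsage {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);
    // second and later calls on this thread reuse the same component chain
    TokenStream ts = analyzer.reusableTokenStream("content",
        new StringReader("Вместе с тем о силе электромагнитной энергии"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());
    }
  }
}
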
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java Thu Jan 14 19:05:12 2010
@@ -29,6 +29,7 @@
  * @deprecated Use {@link LowerCaseFilter} instead, which has the same
  *  functionality. This filter will be removed in Lucene 4.0
  */
+@Deprecated
 public final class RussianLowerCaseFilter extends TokenFilter
 {
     private TermAttribute termAtt;

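Migration away from the deprecated filter is mechanical: the Version-aware core LowerCaseFilter slots into the same position in the chain, as the reworked RussianAnalyzer above already demonstrates. A sketch (class, method, and variable names are illustrative):

import java.io.Reader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.lucene.util.Version;

class LowerCaseMigration {
  static TokenStream lowercase(Reader reader) {
    TokenStream ts = new RussianLetterTokenizer(reader);
    // was: ts = new RussianLowerCaseFilter(ts);
    return new LowerCaseFilter(Version.LUCENE_CURRENT, ts);
  }
}
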
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Thu Jan 14 19:05:12 2010
@@ -33,9 +33,9 @@
  */
 public final class ShingleAnalyzerWrapper extends Analyzer {
 
-  protected Analyzer defaultAnalyzer;
-  protected int maxShingleSize = 2;
-  protected boolean outputUnigrams = true;
+  private final Analyzer defaultAnalyzer;
+  private int maxShingleSize = 2;
+  private boolean outputUnigrams = true;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
     super();

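Narrowing the fields to private (and making the delegate final) means configuration now flows only through the constructor and the wrapper's setters. A usage sketch, assuming the wrapper's existing setMaxShingleSize/setOutputUnigrams setters (the WhitespaceAnalyzer delegate and settings are illustrative):

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

class ShingleWrapperUsage {
  static ShingleAnalyzerWrapper build() {
    ShingleAnalyzerWrapper wrapper =
        new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
    wrapper.setMaxShingleSize(3);     // emit up to trigram shingles
    wrapper.setOutputUnigrams(false); // shingles only, no single terms
    return wrapper;
  }
}
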
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Thu Jan 14 19:05:12 2010
@@ -182,7 +182,7 @@
           shingleBufferPosition++;
           return true;
         }
-      } else {
+      } else if (shingleBufferPosition % this.maxShingleSize == 0){
         shingleBufferPosition++;
       }
   
@@ -197,7 +197,7 @@
           termBuffer = termAtt.resizeTermBuffer(termLength);
         buf.getChars(0, termLength, termBuffer, 0);
         termAtt.setTermLength(termLength);
-        if ((! outputUnigrams) && shingleBufferPosition == 1) {
+        if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
           posIncrAtt.setPositionIncrement(1);
         } else {
           posIncrAtt.setPositionIncrement(0);

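Both hunks swap an equality test for a modulo test so the buffer-position bookkeeping also holds when maxShingleSize is greater than the default of 2 and unigrams are suppressed; the new four-gram test data added below exercises precisely that case. A condensed sketch of that configuration (the input sentence mirrors the test data):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

class FourGramSketch {
  static ShingleFilter build() {
    Tokenizer source = new WhitespaceTokenizer(
        new StringReader("please divide this sentence into shingles"));
    ShingleFilter filter = new ShingleFilter(source, 4); // maxShingleSize = 4
    filter.setOutputUnigrams(false);
    // expected output starts "please divide", "please divide this",
    // "please divide this sentence" with position increments 1, 0, 0, ...
    return filter;
  }
}
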
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -16,16 +16,18 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 /**
@@ -35,41 +37,28 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class ThaiAnalyzer extends Analyzer {
+public final class ThaiAnalyzer extends ReusableAnalyzerBase {
   private final Version matchVersion;
 
   public ThaiAnalyzer(Version matchVersion) {
     this.matchVersion = matchVersion;
   }
-  
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream ts = new StandardTokenizer(matchVersion, reader);
-    ts = new StandardFilter(ts);
-    ts = new ThaiWordFilter(ts);
-    ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-    return ts;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
+
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   * 
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
+   *         {@link ThaiWordFilter}, and {@link StopFilter}
+   */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new ThaiWordFilter(streams.result);
-      streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-      streams.result.reset(); // reset the ThaiWordFilter's state
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new ThaiWordFilter(result);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion,
+        result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
   }
 }

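ThaiAnalyzer follows the same migration as RussianAnalyzer earlier in this commit: the tokenStream/reusableTokenStream pair and the SavedStreams holder collapse into a single createComponents override, with reuse handled by ReusableAnalyzerBase. The pattern, generalized (MyAnalyzer and its filter chain are illustrative, not part of this commit):

import java.io.Reader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public final class MyAnalyzer extends ReusableAnalyzerBase {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // the Tokenizer is handed to TokenStreamComponents separately so the
    // base class can reset it against a new Reader on reuse
    final Tokenizer source = new WhitespaceTokenizer(reader);
    TokenStream result = new LowerCaseFilter(Version.LUCENE_CURRENT, source);
    return new TokenStreamComponents(source, result);
  }
}
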
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -17,10 +17,10 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.Version;
 
@@ -78,7 +78,9 @@
    * Test that custom stopwords work, and are not case-sensitive.
    */
   public void testCustomStopwords() throws Exception {
-    ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+    Set<String> set = new HashSet<String>();
+    Collections.addAll(set, "the", "and", "a");
+    ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
         "brown", "fox" });
   }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Thu Jan 14 19:05:12 2010
@@ -17,10 +17,12 @@
  * limitations under the License.
  */
 
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.Version;
 
 /**

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Thu Jan 14 19:05:12 2010
@@ -17,94 +17,65 @@
  * limitations under the License.
  */
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
-  private static String[] locations = {
-      "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://voxel.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-      // too slow:
-      //"http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-
-  private static byte[] patternsFileContent;
+  static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  static final File testFile = new File(dataDir, "org/apache/lucene/analysis/compound/da_UTF8.xml");
 
   @Override
   protected void setUp() throws Exception {
     super.setUp();
-    getHyphenationPatternFileContents();
   }
 
-  public void testHyphenationCompoundWordsDE() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung" };
+  public void testHyphenationCompoundWordsDA() throws Exception {
+    String[] dict = { "læse", "hest" };
 
-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    Reader reader = getHyphenationReader();
 
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
         .getHyphenationTree(reader);
 
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+            "min veninde som er lidt af en læsehest")), hyphenator,
         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
-        "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
-        "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
-        29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
-        0, 1 });
+    assertTokenStreamContents(tf, 
+        new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
+    );
   }
 
   public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };
-
-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    String[] dict = { "basketball", "basket", "ball", "kurv" };
+    Reader reader = getHyphenationReader();
 
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
         .getHyphenationTree(reader);
 
+    // the word basket will not be added due to the longest match option
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz")), hyphenator, dict,
+            "basketballkurv")), hyphenator, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz",
-        "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
-        0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
-        0, 0, 0 });
+    assertTokenStreamContents(tf, 
+        new String[] { "basketballkurv", "basketball", "ball", "kurv" },
+        new int[] { 1, 0, 0, 0 }
+    );
   }
 
   public void testDumbCompoundWordsSE() throws Exception {
@@ -157,19 +128,10 @@
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };
 
-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
-
-    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-        .getHyphenationTree(reader);
-
     Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
         "Rindfleischüberwachungsgesetz"));
-    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
-        wsTokenizer, hyphenator, dict,
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        wsTokenizer, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@@ -185,53 +147,7 @@
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
   }
 
-  private void getHyphenationPatternFileContents() {
-    if (patternsFileContent == null) {
-      try {
-        List urls = new LinkedList(Arrays.asList(locations));
-        Collections.shuffle(urls);
-        URL url = new URL((String)urls.get(0));
-        InputStream in = url.openStream();
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-        int count;
-
-        while ((count = in.read(buffer)) != -1) {
-          out.write(buffer, 0, count);
-        }
-        in.close();
-        out.close();
-        patternsFileContent = out.toByteArray();
-      } catch (IOException e) {
-        // we swallow all exceptions - the user might have no internet connection
-      }
-    }
-  }
-
-  private Reader getHyphenationReader(String filename) throws Exception {
-    if (patternsFileContent == null) {
-      return null;
-    }
-
-    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
-        patternsFileContent));
-
-    ZipEntry entry;
-    while ((entry = zipstream.getNextEntry()) != null) {
-      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
-        int count;
-        while ((count = zipstream.read(buffer)) != -1) {
-          outstream.write(buffer, 0, count);
-        }
-        outstream.close();
-        zipstream.close();
-        return new StringReader(new String(outstream.toByteArray(),
-            "ISO-8859-1"));
-      }
-    }
-    // we never should get here
-    return null;
+  private Reader getHyphenationReader() throws Exception {
+    return new InputStreamReader(new FileInputStream(testFile), "UTF-8");
   }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -40,6 +40,7 @@
   /**
    * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
    */
+  @Deprecated
   public void testStopWordLegacy() throws Exception {
     assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem", 
         new String[] { "mluvime", "volnem" });
@@ -53,6 +54,7 @@
   /**
    * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
    */
+  @Deprecated
   public void testReusableTokenStreamLegacy() throws Exception {
     Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
     assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
@@ -69,6 +71,7 @@
    * An input stream that always throws IOException for testing.
    * @deprecated Remove this class when the loadStopWords method is removed.
    */
+  @Deprecated
   private class UnreliableInputStream extends InputStream {
     @Override
     public int read() throws IOException {
@@ -82,6 +85,7 @@
    * this would cause a NPE when it is time to create the StopFilter.
    * @deprecated Remove this test when the loadStopWords method is removed.
    */
+  @Deprecated
   public void testInvalidStopWordFile() throws Exception {
     CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
     cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
@@ -94,6 +98,7 @@
    * when using reusable token streams.
    * @deprecated Remove this test when the loadStopWords method is removed.
    */
+  @Deprecated
   public void testStopWordFileReuse() throws Exception {
     CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
     assertAnalyzesToReuse(cz, "Česká Republika", 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Thu Jan 14 19:05:12 2010
@@ -17,11 +17,8 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
-
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java Thu Jan 14 19:05:12 2010
@@ -24,7 +24,7 @@
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
 
 /**
- * Test the Arabic Normalization Filter
+ * Test the Persian Normalization Filter
  * 
  */
 public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java Thu Jan 14 19:05:12 2010
@@ -34,7 +34,9 @@
 
   public void testPayloads() throws Exception {
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
+      (new WhitespaceTokenizer(new StringReader(test)), 
+       DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
@@ -53,7 +55,9 @@
   public void testNext() throws Exception {
 
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
+      (new WhitespaceTokenizer(new StringReader(test)), 
+       DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     assertTermEquals("The", filter, null);
     assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
     assertTermEquals("red", filter, "JJ".getBytes("UTF-8"));

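The updated tests spell out all three constructor arguments instead of relying on a single-argument form, which makes the encoder choice visible at the call site. IdentityEncoder keeps the delimited text as raw (charset-encoded) bytes; a numeric encoder drops into the same slot, for example FloatEncoder from the same payloads package (the weighted input string is illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;

class FloatPayloadSketch {
  static DelimitedPayloadTokenFilter build() {
    String weighted = "quick|2.0 brown|1.5 fox|0.5";
    return new DelimitedPayloadTokenFilter(
        new WhitespaceTokenizer(new StringReader(weighted)),
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER,
        new FloatEncoder()); // encodes each weight as a 4-byte float payload
  }
}
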
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=899359&r1=899358&r2=899359&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Thu Jan 14 19:05:12 2010
@@ -200,6 +200,93 @@
     "word", "shingle",
     "word"
   };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("into shingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
+    1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES = new String[] {
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+  };
 
 
   @Override
@@ -272,8 +359,25 @@
                            TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
                            true);
   }
-
-
+  
+  public void testTriGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+  
+  public void testFourGramFilter() throws IOException {
+    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
+        FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+        FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
+        FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
+  }
+  
   
   public void testReset() throws Exception {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));


