lucene-solr-commits mailing list archives

From markrmil...@apache.org
Subject svn commit: r892821 [1/3] - in /lucene/solr/trunk: ./ src/test/org/apache/solr/analysis/ src/test/test-files/solr/conf/
Date Mon, 21 Dec 2009 13:53:52 GMT
Author: markrmiller
Date: Mon Dec 21 13:53:50 2009
New Revision: 892821

URL: http://svn.apache.org/viewvc?rev=892821&view=rev
Log:
SOLR-1674: Improve analysis tests and cut over to new TokenStream API
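
For context, this cutover replaces the deprecated Token-based consumption
style with the attribute-based TokenStream API. A minimal sketch of the two
styles (a hypothetical consumer, not code from this commit):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class CutoverSketch {
      // old style (deprecated): pull a new Token object per call to next()
      static void consumeOld(TokenStream stream) throws IOException {
        for (Token t = stream.next(); t != null; t = stream.next()) {
          String term = new String(t.termBuffer(), 0, t.termLength());
          // ... inspect term, t.startOffset(), t.getPositionIncrement() ...
        }
        stream.close();
      }

      // new style: obtain attribute views once, then advance the stream in
      // place; the same attribute instances are updated on each increment
      static void consumeNew(TokenStream stream) throws IOException {
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          String term = termAtt.term();
          // ... inspect term and any other registered attributes ...
        }
        stream.end();
        stream.close();
      }
    }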

Added:
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestArabicFilters.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseTokenizerFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestNGramFilters.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRussianFilters.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStandardFactories.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java
    lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt
    lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/test/org/apache/solr/analysis/AnalysisTestCase.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/LengthFilterTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepWordFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Dec 21 13:53:50 2009
@@ -174,6 +174,9 @@
 
 * SOLR-1662: Added Javadocs in BufferedTokenStream and fixed incorrect cloning
   in TestBufferedTokenStream (Robert Muir, Uwe Schindler via shalin)
+  
+* SOLR-1674: Improve analysis tests and cut over to new TokenStream API.
+  (Robert Muir via Mark Miller)
 
 Build
 ----------------------

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/AnalysisTestCase.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/AnalysisTestCase.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/AnalysisTestCase.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/AnalysisTestCase.java Mon Dec 21 13:53:50 2009
@@ -17,19 +17,21 @@
 
 package org.apache.solr.analysis;
 import org.apache.solr.core.SolrConfig;
+import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.util.TestHarness;
 import junit.framework.TestCase;
 
 /**
  *
  */
-abstract public class AnalysisTestCase extends TestCase {
+abstract public class AnalysisTestCase extends AbstractSolrTestCase {
   protected SolrConfig solrConfig;
   /** Creates a new instance of AnalysisTestCase */
   public AnalysisTestCase() {
   }
   
   public String getSolrConfigFile() { return "solrconfig.xml"; }
+  public String getSchemaFile() { return "schema.xml"; }
 
   public void setUp() throws Exception {
     // if you override setUp or tearDown, you better call

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/BaseTokenTestCase.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/BaseTokenTestCase.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/BaseTokenTestCase.java Mon Dec 21 13:53:50 2009
@@ -18,174 +18,134 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.io.StringReader;
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-
-import junit.framework.TestCase;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * General token testing helper functions
  */
 public abstract class BaseTokenTestCase extends AnalysisTestCase
 {
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuilder out = new StringBuilder();
-    Token t = in.next();
-    if (null != t)
-      out.append(new String(t.termBuffer(), 0, t.termLength()));
+  // some helpers to test Analyzers and TokenStreams:
+  // these are taken from Lucene's BaseTokenStreamTestCase
+   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[], String types[], int posIncrements[])
+      throws IOException {
+    assertNotNull(output);
+    assertTrue("has TermAttribute", ts.hasAttribute(TermAttribute.class));
+    TermAttribute termAtt = (TermAttribute) ts
+        .getAttribute(TermAttribute.class);
     
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(new String(t.termBuffer(), 0, t.termLength()));
+    OffsetAttribute offsetAtt = null;
+    if (startOffsets != null || endOffsets != null) {
+      assertTrue("has OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
+      offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
     }
-    in.close();
-    return out.toString();
-  }
-
-  public List<String> tok2str(Iterable<Token> tokLst) {
-    ArrayList<String> lst = new ArrayList<String>();
-    for ( Token t : tokLst ) {
-      lst.add( new String(t.termBuffer(), 0, t.termLength()));
-    }
-    return lst;
-  }
-
-
-  public void assertTokEqual(List<Token> a, List<Token> b) {
-    assertTokEq(a,b,false);
-    assertTokEq(b,a,false);
-  }
-
-  public void assertTokEqualOff(List<Token> a, List<Token> b) {
-    assertTokEq(a,b,true);
-    assertTokEq(b,a,true);
-  }
-
-  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
-    int pos=0;
-    for (Iterator iter = a.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
-              , checkOff ? tok.startOffset() : -1
-              , checkOff ? tok.endOffset() : -1
-              )) 
-      {
-        fail(a + "!=" + b);
-      }
+    
+    TypeAttribute typeAtt = null;
+    if (types != null) {
+      assertTrue("has TypeAttribute", ts.hasAttribute(TypeAttribute.class));
+      typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
     }
-  }
-
-  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
-    int pos=0;
-    for (Iterator iter = lst.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
-          && (startOff==-1 || tok.startOffset()==startOff)
-          && (endOff  ==-1 || tok.endOffset()  ==endOff  )
-           )
-      {
-        return true;
-      }
+    
+    PositionIncrementAttribute posIncrAtt = null;
+    if (posIncrements != null) {
+      assertTrue("has PositionIncrementAttribute", ts
+          .hasAttribute(PositionIncrementAttribute.class));
+      posIncrAtt = (PositionIncrementAttribute) ts
+          .getAttribute(PositionIncrementAttribute.class);
     }
-    return false;
-  }
-
-
-  /***
-   * Return a list of tokens according to a test string format:
-   * a b c  =>  returns List<Token> [a,b,c]
-   * a/b   => tokens a and b share the same spot (b.positionIncrement=0)
-   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
-   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
-   */
-  public List<Token> tokens(String str) {
-    String[] arr = str.split(" ");
-    List<Token> result = new ArrayList<Token>();
-    for (int i=0; i<arr.length; i++) {
-      String[] toks = arr[i].split("/");
-      String[] params = toks[0].split(",");
-
-      int posInc;
-      int start;
-      int end;
-
-      if (params.length > 1) {
-        posInc = Integer.parseInt(params[1]);
-      } else {
-        posInc = 1;
-      }
-
-      if (params.length > 2) {
-        start = Integer.parseInt(params[2]);
-      } else {
-        start = 0;
-      }
-
-      if (params.length > 3) {
-        end = Integer.parseInt(params[3]);
-      } else {
-        end = start + params[0].length();
-      }
-
-      Token t = new Token(params[0],start,end,"TEST");
-      t.setPositionIncrement(posInc);
+    
+    ts.reset();
+    for (int i = 0; i < output.length; i++) {
+      // extra safety: clear the attributes and assign bogus values to
+      // enforce that no state is preserved between tokens
+      ts.clearAttributes();
+      termAtt.setTermBuffer("bogusTerm");
+      if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
+      if (typeAtt != null) typeAtt.setType("bogusType");
+      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
       
-      result.add(t);
-      for (int j=1; j<toks.length; j++) {
-        t = new Token(toks[j],0,0,"TEST");
-        t.setPositionIncrement(0);
-        result.add(t);
-      }
-    }
-    return result;
-  }
-
-  //------------------------------------------------------------------------
-  // These may be useful beyond test cases...
-  //------------------------------------------------------------------------
-
-  static List<Token> getTokens(TokenStream tstream) throws IOException {
-    List<Token> tokens = new ArrayList<Token>();
-    while (true) {
-      Token t = tstream.next();
-      if (t==null) break;
-      tokens.add(t);
-    }
-    return tokens;
-  }
-
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Token... toks) {
-      this.toks = Arrays.asList(toks).iterator();
-    }
-    public IterTokenStream(Iterable<Token> toks) {
-      this.toks = toks.iterator();
-    }
-    public IterTokenStream(Iterator<Token> toks) {
-      this.toks = toks;
-    }
-    public IterTokenStream(String ... text) {
-      int off = 0;
-      ArrayList<Token> t = new ArrayList<Token>( text.length );
-      for( String txt : text ) {
-        t.add( new Token( txt, off, off+txt.length() ) );
-        off += txt.length() + 2;
-      }
-      this.toks = t.iterator();
-    }
-    @Override
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
+      assertTrue("token " + i + " exists", ts.incrementToken());
+      assertEquals("term " + i, output[i], termAtt.term());
+      if (startOffsets != null) assertEquals("startOffset " + i,
+          startOffsets[i], offsetAtt.startOffset());
+      if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i],
+          offsetAtt.endOffset());
+      if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
+      if (posIncrements != null) assertEquals("posIncrement " + i,
+          posIncrements[i], posIncrAtt.getPositionIncrement());
+    }
+    assertFalse("end of stream", ts.incrementToken());
+    ts.end();
+    ts.close();
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output)
+      throws IOException {
+    assertTokenStreamContents(ts, output, null, null, null, null);
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      String[] types) throws IOException {
+    assertTokenStreamContents(ts, output, null, null, types, null);
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int[] posIncrements) throws IOException {
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements);
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[]) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null);
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[], int[] posIncrements)
+      throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null,
+        posIncrements);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[], String types[],
+      int posIncrements[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)),
+        output, startOffsets, endOffsets, types, posIncrements);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output)
+      throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, null);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, String[] types) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, types, null);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int[] posIncrements) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[]) throws IOException {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[], int[] posIncrements)
+      throws IOException {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null,
+        posIncrements);
   }
 }
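
A minimal usage sketch of the helpers above, in the style of the tests that
follow (the input string and expected terms are illustrative, and the usual
WhitespaceTokenizer/StringReader imports from this commit are assumed):

    public void testWhitespaceTokenization() throws Exception {
      // any subclass of BaseTokenTestCase can call the helper directly
      Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("quick brown fox"));
      assertTokenStreamContents(tokenizer, new String[] { "quick", "brown", "fox" });
    }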

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java Mon Dec 21 13:53:50 2009
@@ -17,9 +17,13 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.common.ResourceLoader;
 
+import java.io.StringReader;
 import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
@@ -29,7 +33,7 @@
  * used by the StopFilterFactoryTest TODO: consider creating separate test files
  * so this won't break if stop filter test files change
  **/
-public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
+public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
   public String getSchemaFile() {
     return "schema-stop-keep.xml";
   }
@@ -66,4 +70,23 @@
         .isIgnoreCase() == true);
 
   }
+  
+  /**
+   * If no words are provided, then a set of English default stopwords is used.
+   */
+  public void testDefaults() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue(words.contains("the"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, 
+        new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
+  }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java Mon Dec 21 13:53:50 2009
@@ -16,29 +16,20 @@
  */
 package org.apache.solr.analysis;
 
-import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
 import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.Map.Entry;
 
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
 
 /**
  * Tests CommonGramsQueryFilter
  */
-public class CommonGramsFilterTest extends TestCase {
+public class CommonGramsFilterTest extends BaseTokenTestCase {
   private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
       "of" };
   
@@ -63,18 +54,6 @@
     assertEquals("How", term.term());
   }
   
-  public void testCommonGramsQueryFilter() throws Exception {
-    Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
-    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
-        .hasNext();) {
-      Map.Entry<String, String> me = i.next();
-      String input = me.getKey();
-      String expected = me.getValue();
-      String message = "message: input value is: " + input;
-      assertEquals(message, expected, testFilter(input, "query"));
-    }
-  }
-  
   public void testQueryReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
@@ -93,18 +72,6 @@
     assertEquals("How_the", term.term());
   }
   
-  public void testCommonGramsFilter() throws Exception {
-    Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
-    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
-        .hasNext();) {
-      Map.Entry<String, String> me = i.next();
-      String input = me.getKey();
-      String expected = me.getValue();
-      String message = "message: input value is: " + input;
-      assertEquals(message, expected, testFilter(input, "common"));
-    }
-  }
-  
   /**
    * This is for testing CommonGramsQueryFilter which outputs a set of tokens
    * optimized for querying with only one token at each position, either a
@@ -116,150 +83,226 @@
    * 
    * @return Map<String,String>
    */
-  private static Map<String, String> initQueryMap() {
-    Map<String, String> input2expected = new LinkedHashMap<String, String>();
+  public void testCommonGramsQueryFilter() throws Exception {
+    Analyzer a = new Analyzer() {    
+      @Override
+      public TokenStream tokenStream(String field, Reader in) {
+        return new CommonGramsQueryFilter(new CommonGramsFilter(
+            new WhitespaceTokenizer(in), commonWords));
+      } 
+    };
 
     // Stop words used below are "of" "the" and "s"
     
     // two word queries
-    input2expected.put("brown fox", "/brown/fox");
-    input2expected.put("the fox", "/the_fox");
-    input2expected.put("fox of", "/fox_of");
-    input2expected.put("of the", "/of_the");
+    assertAnalyzesTo(a, "brown fox", 
+        new String[] { "brown", "fox" });
+    assertAnalyzesTo(a, "the fox", 
+        new String[] { "the_fox" });
+    assertAnalyzesTo(a, "fox of", 
+        new String[] { "fox_of" });
+    assertAnalyzesTo(a, "of the", 
+        new String[] { "of_the" });
     
     // one word queries
-    input2expected.put("the", "/the");
-    input2expected.put("foo", "/foo");
+    assertAnalyzesTo(a, "the", 
+        new String[] { "the" });
+    assertAnalyzesTo(a, "foo", 
+        new String[] { "foo" });
 
     // 3 word combinations s=stopword/common word n=not a stop word
-    input2expected.put("n n n", "/n/n/n");
-    input2expected.put("quick brown fox", "/quick/brown/fox");
-
-    input2expected.put("n n s", "/n/n_s");
-    input2expected.put("quick brown the", "/quick/brown_the");
-
-    input2expected.put("n s n", "/n_s/s_n");
-    input2expected.put("quick the brown", "/quick_the/the_brown");
-
-    input2expected.put("n s s", "/n_s/s_s");
-    input2expected.put("fox of the", "/fox_of/of_the");
-
-    input2expected.put("s n n", "/s_n/n/n");
-    input2expected.put("the quick brown", "/the_quick/quick/brown");
-
-    input2expected.put("s n s", "/s_n/n_s");
-    input2expected.put("the fox of", "/the_fox/fox_of");
-
-    input2expected.put("s s n", "/s_s/s_n");
-    input2expected.put("of the fox", "/of_the/the_fox");
-
-    input2expected.put("s s s", "/s_s/s_s");
-    input2expected.put("of the of", "/of_the/the_of");
-
-    return input2expected;
+    assertAnalyzesTo(a, "n n n", 
+        new String[] { "n", "n", "n" });
+    assertAnalyzesTo(a, "quick brown fox", 
+        new String[] { "quick", "brown", "fox" });
+
+    assertAnalyzesTo(a, "n n s", 
+        new String[] { "n", "n_s" });
+    assertAnalyzesTo(a, "quick brown the", 
+        new String[] { "quick", "brown_the" });
+
+    assertAnalyzesTo(a, "n s n", 
+        new String[] { "n_s", "s_n" });
+    assertAnalyzesTo(a, "quick the brown", 
+        new String[] { "quick_the", "the_brown" });
+
+    assertAnalyzesTo(a, "n s s", 
+        new String[] { "n_s", "s_s" });
+    assertAnalyzesTo(a, "fox of the", 
+        new String[] { "fox_of", "of_the" });
+
+    assertAnalyzesTo(a, "s n n", 
+        new String[] { "s_n", "n", "n" });
+    assertAnalyzesTo(a, "the quick brown", 
+        new String[] { "the_quick", "quick", "brown" });
+
+    assertAnalyzesTo(a, "s n s", 
+        new String[] { "s_n", "n_s" });
+    assertAnalyzesTo(a, "the fox of", 
+        new String[] { "the_fox", "fox_of" });
+
+    assertAnalyzesTo(a, "s s n", 
+        new String[] { "s_s", "s_n" });
+    assertAnalyzesTo(a, "of the fox", 
+        new String[] { "of_the", "the_fox" });
+
+    assertAnalyzesTo(a, "s s s", 
+        new String[] { "s_s", "s_s" });
+    assertAnalyzesTo(a, "of the of", 
+        new String[] { "of_the", "the_of" });
   }
   
-  private static Map<String, String> initMap() {
-    Map<String, String> input2expected = new HashMap<String, String>();
+  public void testCommonGramsFilter() throws Exception {
+    Analyzer a = new Analyzer() {    
+      @Override
+      public TokenStream tokenStream(String field, Reader in) {
+        return new CommonGramsFilter(
+            new WhitespaceTokenizer(in), commonWords);
+      } 
+    };
 
     // Stop words used below are "of" "the" and "s"
     // one word queries
-    input2expected.put("the", "/the");
-    input2expected.put("foo", "/foo");
+    assertAnalyzesTo(a, "the", new String[] { "the" });
+    assertAnalyzesTo(a, "foo", new String[] { "foo" });
 
     // two word queries
-    input2expected.put("brown fox", "/brown/fox");
-    input2expected.put("the fox", "/the,the_fox/fox");
-    input2expected.put("fox of", "/fox,fox_of/of");
-    input2expected.put("of the", "/of,of_the/the");
+    assertAnalyzesTo(a, "brown fox", 
+        new String[] { "brown", "fox" }, 
+        new int[] { 1, 1 });
+    assertAnalyzesTo(a, "the fox", 
+        new String[] { "the", "the_fox", "fox" }, 
+        new int[] { 1, 0, 1 });
+    assertAnalyzesTo(a, "fox of", 
+        new String[] { "fox", "fox_of", "of" }, 
+        new int[] { 1, 0, 1 });
+    assertAnalyzesTo(a, "of the", 
+        new String[] { "of", "of_the", "the" }, 
+        new int[] { 1, 0, 1 });
 
     // 3 word combinations s=stopword/common word n=not a stop word
-    input2expected.put("n n n", "/n/n/n");
-    input2expected.put("quick brown fox", "/quick/brown/fox");
-
-    input2expected.put("n n s", "/n/n,n_s/s");
-    input2expected.put("quick brown the", "/quick/brown,brown_the/the");
-
-    input2expected.put("n s n", "/n,n_s/s,s_n/n");
-    input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
-
-    input2expected.put("n s s", "/n,n_s/s,s_s/s");
-    input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
-
-    input2expected.put("s n n", "/s,s_n/n/n");
-    input2expected.put("the quick brown", "/the,the_quick/quick/brown");
-
-    input2expected.put("s n s", "/s,s_n/n,n_s/s");
-    input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
-
-    input2expected.put("s s n", "/s,s_s/s,s_n/n");
-    input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
-
-    input2expected.put("s s s", "/s,s_s/s,s_s/s");
-    input2expected.put("of the of", "/of,of_the/the,the_of/of");
-
-    return input2expected;
+    assertAnalyzesTo(a, "n n n", 
+        new String[] { "n", "n", "n" }, 
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "quick brown fox", 
+        new String[] { "quick", "brown", "fox" }, 
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "n n s", 
+        new String[] { "n", "n", "n_s", "s" }, 
+        new int[] { 1, 1, 0, 1 });
+    assertAnalyzesTo(a, "quick brown the", 
+        new String[] { "quick", "brown", "brown_the", "the" }, 
+        new int[] { 1, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "n s n", 
+        new String[] { "n", "n_s", "s", "s_n", "n" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "quick the fox", 
+        new String[] { "quick", "quick_the", "the", "the_fox", "fox" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "n s s", 
+        new String[] { "n", "n_s", "s", "s_s", "s" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "fox of the", 
+        new String[] { "fox", "fox_of", "of", "of_the", "the" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s n n", 
+        new String[] { "s", "s_n", "n", "n" }, 
+        new int[] { 1, 0, 1, 1 });
+    assertAnalyzesTo(a, "the quick brown", 
+        new String[] { "the", "the_quick", "quick", "brown" }, 
+        new int[] { 1, 0, 1, 1 });
+
+    assertAnalyzesTo(a, "s n s", 
+        new String[] { "s", "s_n", "n", "n_s", "s" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "the fox of", 
+        new String[] { "the", "the_fox", "fox", "fox_of", "of" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s s n", 
+        new String[] { "s", "s_s", "s", "s_n", "n" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "of the fox", 
+        new String[] { "of", "of_the", "the", "the_fox", "fox" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s s s", 
+        new String[] { "s", "s_s", "s", "s_s", "s" }, 
+        new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "of the of", 
+        new String[] { "of", "of_the", "the", "the_of", "of" }, 
+        new int[] { 1, 0, 1, 0, 1 });
   }
   
-  /*
-   * Helper methodsCopied and from CDL XTF BigramsStopFilter.java and slightly
-   * modified to use with CommonGrams http://xtf.wiki.sourceforge.net/
+  /**
+   * Test that CommonGramsFilter works correctly in case-sensitive mode
    */
+  public void testCaseSensitive() throws Exception {
+    final String input = "How The s a brown s cow d like A B thing?";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    Set common = CommonGramsFilter.makeCommonSet(commonWords);
+    TokenFilter cgf = new CommonGramsFilter(wt, common, false);
+    assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
+        "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
+        "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
+  }
+  
   /**
-   * Very simple tokenizer that breaks up a string into a series of Lucene
-   * {@link Token Token}s.
+   * Test CommonGramsQueryFilter in the case that the last word is a stopword
    */
-  static class StringTokenStream extends TokenStream {
-    private String str;
-
-    private int prevEnd = 0;
-
-    private StringTokenizer tok;
-
-    private int count = 0;
-
-    public StringTokenStream(String str, String delim) {
-      this.str = str;
-      tok = new StringTokenizer(str, delim);
-    }
-
-    public Token next() {
-      if (!tok.hasMoreTokens())
-        return null;
-      count++;
-      String term = tok.nextToken();
-      Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
-          prevEnd)
-          + term.length(), "word");
-      prevEnd = t.endOffset();
-      return t;
-    }
+  public void testLastWordisStopWord() throws Exception {
+    final String input = "dog the";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "dog_the" });
   }
   
-  public static String testFilter(String in, String type) throws IOException {
-    TokenStream nsf;
-    StringTokenStream ts = new StringTokenStream(in, " .");
-    if (type.equals("query")) {
-      CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
-      nsf = new CommonGramsQueryFilter(cgf);
-    } else {
-      nsf = new CommonGramsFilter(ts, commonWords);
-    }
-
-    StringBuffer outBuf = new StringBuffer();
-    while (true) {
-      Token t = nsf.next();
-      if (t == null)
-        break;
-      for (int i = 0; i < t.getPositionIncrement(); i++)
-        outBuf.append('/');
-      if (t.getPositionIncrement() == 0)
-        outBuf.append(',');
-      outBuf.append(t.term());
-    }
-
-    String out = outBuf.toString();
-    out = out.replaceAll(" ", "");
-    return out;
+  /**
+   * Test CommonGramsQueryFilter in the case that the first word is a stopword
+   */
+  public void testFirstWordisStopWord() throws Exception {
+    final String input = "the dog";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_dog" });
+  }
+  
+  /**
+   * Test CommonGramsQueryFilter in the case of a single (stop)word query
+   */
+  public void testOneWordQueryStopWord() throws Exception {
+    final String input = "the";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the" });
+  }
+  
+  /**
+   * Test CommonGramsQueryFilter in the case of a single word query
+   */
+  public void testOneWordQuery() throws Exception {
+    final String input = "monster";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "monster" });
+  }
+  
+  /**
+   * Test CommonGramsQueryFilter when first and last words are stopwords.
+   */
+  public void testFirstAndLastStopWord() throws Exception {
+    final String input = "the of";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_of" });
   }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java Mon Dec 21 13:53:50 2009
@@ -16,9 +16,12 @@
  */
 package org.apache.solr.analysis;
 
-import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
 
+import java.io.StringReader;
 import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
@@ -28,7 +31,7 @@
  * used by the StopFilterFactoryTest TODO: consider creating separate test files
  * so this won't break if stop filter test files change
  **/
-public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
+public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
   public String getSchemaFile() {
     return "schema-stop-keep.xml";
   }
@@ -65,4 +68,23 @@
         .isIgnoreCase() == true);
 
   }
+  
+  /**
+   * If no words are provided, then a set of English default stopwords is used.
+   */
+  public void testDefaults() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue(words.contains("the"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, 
+        new String[] { "testing_the", "the_factory" });
+  }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java Mon Dec 21 13:53:50 2009
@@ -16,36 +16,24 @@
  */
 package org.apache.solr.analysis;
 
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
-public class DoubleMetaphoneFilterFactoryTest extends TestCase {
+public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
 
   public void testDefaults() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
-    TokenStream inputStream = new IterTokenStream("international");
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
     TokenStream filteredStream = factory.create(inputStream);
-
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
-
-    Token token = filteredStream.next(new Token());
-    assertEquals(13, token.termLength());
-    assertEquals("international", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    token = filteredStream.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filteredStream.next(new Token()));
+    assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" });
   }
 
   public void testSettingSizeAndInject() throws Exception {
@@ -55,17 +43,31 @@
     parameters.put("maxCodeLength", "8");
     factory.init(parameters);
 
-    TokenStream inputStream = new IterTokenStream("international");
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
     TokenStream filteredStream = factory.create(inputStream);
-
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+    assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
+  }
+  
+  /**
+   * Ensure that reset() removes any state (buffered tokens)
+   */
+  public void testReset() throws Exception {
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(new HashMap<String, String>());
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
-    Token token = filteredStream.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    assertNull(filteredStream.next(new Token()));
+    TokenStream filteredStream = factory.create(inputStream);
+    TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+    
+    assertTrue(filteredStream.incrementToken());
+    assertEquals(13, termAtt.termLength());
+    assertEquals("international", termAtt.term());
+    filteredStream.reset();
+    
+    // ensure there are no more tokens, such as ANTRNXNL
+    assertFalse(filteredStream.incrementToken());
   }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java Mon Dec 21 13:53:50 2009
@@ -16,94 +16,52 @@
  */
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
+import java.io.StringReader;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 
-public class DoubleMetaphoneFilterTest extends TestCase {
+public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
 
   public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "ANTR" });
   }
 
   public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
-
-    Token token = filter.next(new Token());
-    assertEquals(13, token.termLength());
-    assertEquals("international", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
   }
 
   public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new IterTokenStream("Kuczewski");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
   }
 
   public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
   }
 
   public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&" });
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
-
-    Token token = filter.next(new Token());
-    assertEquals(5, token.termLength());
-    assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
-        .termLength()));
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   }
 
   public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&" });
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
-
-    assertEquals("12345", filter.next(new Token()).term());
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
     
     // should have something after the stream
-    stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&", "hello" });
+    stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
     filter = new DoubleMetaphoneFilter(stream, 8, false);
-    assertNotNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
 
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java Mon Dec 21 13:53:50 2009
@@ -16,11 +16,17 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.tartarus.snowball.ext.EnglishStemmer;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -32,11 +38,11 @@
   public void test() throws IOException {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       stemmer.setCurrent(test[i]);
       stemmer.stem();
-      gold.append(stemmer.getCurrent()).append(' ');
+      gold[i] = stemmer.getCurrent();
     }
 
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
@@ -44,21 +50,23 @@
 
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   public void testProtected() throws Exception {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
         stemmer.setCurrent(test[i]);
         stemmer.stem();
-        gold.append(stemmer.getCurrent()).append(' ');
+        gold[i] = stemmer.getCurrent();
       } else {
-        gold.append(test[i]).append(' ');
+        gold[i] = test[i];
       }
     }
 
@@ -69,8 +77,10 @@
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   class LinesMockSolrResourceLoader implements ResourceLoader {

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/LengthFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/LengthFilterTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/LengthFilterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/LengthFilterTest.java Mon Dec 21 13:53:50 2009
@@ -17,9 +17,13 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
 public class LengthFilterTest extends BaseTokenTestCase {
 
   public void test() throws IOException {
@@ -28,9 +32,8 @@
     args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
     factory.init(args);
-    String[] test = {"foo", "foobar", "super-duper-trooper"};
-    String gold = "foobar";
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString(), out);
+    String test = "foo foobar super-duper-trooper";
+    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
+    assertTokenStreamContents(stream, new String[] { "foobar" });
   }
 }
\ No newline at end of file

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java Mon Dec 21 13:53:50 2009
@@ -16,11 +16,18 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.tartarus.snowball.ext.EnglishStemmer;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -32,11 +39,11 @@
   public void test() throws IOException {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
-    for (String aTest : test) {
-      stemmer.setCurrent(aTest);
+    String[] gold = new String[test.length];
+    for (int i = 0; i < test.length; i++) {
+      stemmer.setCurrent(test[i]);
       stemmer.stem();
-      gold.append(stemmer.getCurrent()).append(' ');
+      gold[i] = stemmer.getCurrent();
     }
 
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
@@ -45,21 +52,27 @@
 
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
-  public void testProtected() throws Exception {
+  /**
+   * Tests the protected words mechanism of EnglishPorterFilterFactory
+   */
+  @Deprecated
+  public void testProtectedOld() throws Exception {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
         stemmer.setCurrent(test[i]);
         stemmer.stem();
-        gold.append(stemmer.getCurrent()).append(' ');
+        gold[i] = stemmer.getCurrent();
       } else {
-        gold.append(test[i]).append(' ');
+        gold[i] = test[i];
       }
     }
 
@@ -70,8 +83,10 @@
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   class LinesMockSolrResourceLoader implements ResourceLoader {
@@ -93,5 +108,22 @@
       return null;
     }
   }
+  
+  /**
+   * Test the protected words mechanism of SnowballPorterFilterFactory
+   */
+  public void testProtected() throws Exception {
+    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("protected", "protwords.txt");
+    args.put("language", "English");
+    factory.init(args);
+    factory.inform(loader);
+    Reader reader = new StringReader("ridding of some stemming");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
+  }
 }
 

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestArabicFilters.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestArabicFilters.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestArabicFilters.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestArabicFilters.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Simple tests to ensure the Arabic filter factories are working.
+ */
+public class TestArabicFilters extends BaseTokenTestCase {
+  /**
+   * Test ArabicLetterTokenizerFactory
+   */
+  public void testTokenizer() throws Exception {
+    Reader reader = new StringReader("الذين مَلكت أيمانكم");
+    ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, new String[] {"الذين", "مَلكت", "أيمانكم"});
+  }
+  
+  /**
+   * Test ArabicNormalizationFilterFactory
+   */
+  public void testNormalizer() throws Exception {
+    Reader reader = new StringReader("الذين مَلكت أيمانكم");
+    ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
+    ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
+    Tokenizer tokenizer = factory.create(reader);
+    TokenStream stream = filterFactory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
+  }
+  
+  /**
+   * Test ArabicStemFilterFactory
+   */
+  public void testStemmer() throws Exception {
+    Reader reader = new StringReader("الذين مَلكت أيمانكم");
+    ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
+    ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
+    ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
+    Tokenizer tokenizer = factory.create(reader);
+    TokenStream stream = normFactory.create(tokenizer);
+    stream = stemFactory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
+  }
+}
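
The three factories above chain letter tokenization through normalization into
stemming. Outside the factory plumbing, the equivalent analysis chain can be written
directly against the contrib analyzers (a sketch under the assumption that the
classes live in org.apache.lucene.analysis.ar, as in contrib/analyzers; the
ArabicChainAnalyzer name is made up):

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
    import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
    import org.apache.lucene.analysis.ar.ArabicStemFilter;

    /** Same chain as testStemmer, expressed as a plain Analyzer. */
    class ArabicChainAnalyzer extends Analyzer {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new ArabicLetterTokenizer(reader);
        stream = new ArabicNormalizationFilter(stream);
        stream = new ArabicStemFilter(stream);
        return stream;
      }
    }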

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Brazilian stem filter factory is working.
+ */
+public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually stems and normalizes text.
+   */
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("Brasília");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "brasil" });
+  }
+}

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java Mon Dec 21 13:53:50 2009
@@ -60,9 +60,7 @@
     final String expected = "How now Q B brown A cow B like Q B thing?";
     TokenStream ts = new AB_Q_Stream
       (new WhitespaceTokenizer(new StringReader(input)));
-    final String actual = tsToString(ts);
-    //System.out.println(actual);
-    assertEquals(expected, actual);
+    assertTokenStreamContents(ts, expected.split("\\s"));
   }
   
   public void testABAAB() throws Exception {
@@ -70,9 +68,7 @@
     final String expected = "How now A A B brown A cow B like A A B thing?";
     TokenStream ts = new AB_AAB_Stream
       (new WhitespaceTokenizer(new StringReader(input)));
-    final String actual = tsToString(ts);
-    //System.out.println(actual);
-    assertEquals(expected, actual);
+    assertTokenStreamContents(ts, expected.split("\\s"));
   }
   
   public void testReset() throws Exception {

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCJKTokenizerFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,38 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the CJK tokenizer factory is working.
+ */
+public class TestCJKTokenizerFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the tokenizer actually tokenizes CJK text correctly
+   */
+  public void testTokenizer() throws Exception {
+    Reader reader = new StringReader("我是中国人");
+    CJKTokenizerFactory factory = new CJKTokenizerFactory();
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream, new String[] {"我是", "是中", "中国", "国人"});
+  }
+}

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java Mon Dec 21 13:53:50 2009
@@ -17,14 +17,18 @@
 
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
-
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
 
 /**
- * @version $Id$
+ * 
  */
 public class TestCapitalizationFilter extends BaseTokenTestCase {
   
@@ -64,39 +68,46 @@
     factory.processWord(termBuffer, 0, termBuffer.length, 0 );
     assertEquals( "BIG",  new String(termBuffer, 0, termBuffer.length));
     
-    String out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello there my name is ryan", out );
+    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
     
     // now each token
     factory.onlyFirstWord = false;
-    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello There My Name Is Ryan", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
     
     // now only the long words
     factory.minWordLength = 3;
-    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello There my Name is Ryan", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
     
     // without prefix
-    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
-    assertEquals( "Mckinley", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Mckinley" });
     
     // Now try some prefixes
     factory = new CapitalizationFilterFactory();
     args.put( "okPrefix", "McK" );  // all words
     factory.init( args );
-    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
-    assertEquals( "McKinley", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "McKinley" });
     
     // now try some stuff with numbers
     factory.forceFirstLetter = false;
     factory.onlyFirstWord = false;
-    out = tsToString( factory.create( new IterTokenStream( "1st 2nd third" ) ) );
-    assertEquals( "1st 2nd Third", out );
-    
-    factory.forceFirstLetter = true;
-    out = tsToString( factory.create( new IterTokenStream( "the The the" ) ) );
-    assertEquals( "The The the", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
+    
+    factory.forceFirstLetter = true;  
+    tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "The The the" });
   }
 
   public void testKeepIgnoreCase() throws Exception {
@@ -123,4 +134,80 @@
     factory.processWord(termBuffer, 0, termBuffer.length, 0 );
     assertEquals( "Kitten",  new String(termBuffer, 0, termBuffer.length));
   }
+  
+  /**
+   * Test CapitalizationFilterFactory's minWordLength option.
+   * 
+   * Note the surprising interaction with ONLY_FIRST_WORD: a first word that is too short is skipped, and the next long-enough word is capitalized instead.
+   */
+  public void testMinWordLength() throws Exception {
+    Map<String,String> args = new HashMap<String,String>();
+    args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
+    args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
+    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+        "helo testing"));
+    TokenStream ts = factory.create(tokenizer);
+    assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
+  }
+  
+  /**
+   * Test CapitalizationFilterFactory's maxWordCount option when each token
+   * contains a single word (the limit should have no effect)
+   */
+  public void testMaxWordCount() throws Exception {
+    Map<String,String> args = new HashMap<String,String>();
+    args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
+    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+        "one two three four"));
+    TokenStream ts = factory.create(tokenizer);
+    assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
+  }
+  
+  /**
+   * Test CapitalizationFilterFactory's maxWordCount option when the limit is exceeded (the token is passed through unchanged)
+   */
+  public void testMaxWordCount2() throws Exception {
+    Map<String,String> args = new HashMap<String,String>();
+    args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
+    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
+    factory.init(args);
+    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
+        "one two three four"));
+    TokenStream ts = factory.create(tokenizer);
+    assertTokenStreamContents(ts, new String[] {"one two three four"});
+  }
+  
+  /**
+   * Test CapitalizationFilterFactory's maxTokenLength option when exceeded
+   * 
+   * Note the boundary behavior: a token whose length equals maxTokenLength ('is') is also left uncapitalized.
+   */
+  public void testMaxTokenLength() throws Exception {
+    Map<String,String> args = new HashMap<String,String>();
+    args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
+    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+        "this is a test"));
+    TokenStream ts = factory.create(tokenizer);
+    assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
+  }
+  
+  /**
+   * Test CapitalizationFilterFactory's forceFirstLetter option
+   */
+  public void testForceFirstLetter() throws Exception {
+    Map<String,String> args = new HashMap<String,String>();
+    args.put(CapitalizationFilterFactory.KEEP, "kitten");
+    args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
+    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
+    TokenStream ts = factory.create(tokenizer);
+    assertTokenStreamContents(ts, new String[] {"Kitten"});
+  }
 }
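
These capitalization tests deliberately alternate between tokenizers:
WhitespaceTokenizer splits the input into one token per word, while KeywordTokenizer
emits the entire input as a single token, which is what lets the filter see
multi-word "tokens" such as "the The the". A standalone illustration (sketch; the
TokenizerDemo class is made up):

    import java.io.StringReader;

    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TokenizerDemo {
      public static void main(String[] args) throws Exception {
        dump(new KeywordTokenizer(new StringReader("the The the")));
        // prints a single token: [the The the]
        dump(new WhitespaceTokenizer(new StringReader("the The the")));
        // prints three tokens: [the] [The] [the]
      }

      static void dump(TokenStream ts) throws Exception {
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.print("[" + term.term() + "] ");
        }
        System.out.println();
        ts.close();
      }
    }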

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Chinese filter factory is working.
+ */
+public class TestChineseFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually normalizes text (numerics, stopwords)
+   */
+  public void testFiltering() throws Exception {
+    Reader reader = new StringReader("this 1234 Is such a silly filter");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    ChineseFilterFactory factory = new ChineseFilterFactory();
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
+  }
+}

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseTokenizerFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestChineseTokenizerFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,38 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the Chinese tokenizer factory is working.
+ */
+public class TestChineseTokenizerFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the tokenizer actually tokenizes Chinese text correctly.
+   */
+  public void testTokenizer() throws Exception {
+    Reader reader = new StringReader("我是中国人");
+    ChineseTokenizerFactory factory = new ChineseTokenizerFactory();
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream, new String[] {"我", "是", "中", "国", "人"});
+  }
+}
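
Compared with the CJK test earlier in this commit, the expected arrays show the
difference between the two contrib tokenizers on the same input: CJKTokenizer emits
overlapping character bigrams, ChineseTokenizer emits one token per character. A
side-by-side sketch (the CjkVsChineseDemo class is made up; the tokenizers are
assumed to be in their usual contrib packages):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class CjkVsChineseDemo {
      public static void main(String[] args) throws Exception {
        dump(new CJKTokenizer(new StringReader("我是中国人")));     // 我是 是中 中国 国人
        dump(new ChineseTokenizer(new StringReader("我是中国人"))); // 我 是 中 国 人
      }

      static void dump(TokenStream ts) throws Exception {
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.print(term.term() + " ");
        }
        System.out.println();
        ts.close();
      }
    }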

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -20,6 +20,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
 import java.text.Collator;
 import java.text.RuleBasedCollator;
 import java.util.HashMap;
@@ -27,7 +28,9 @@
 import java.util.Locale;
 import java.util.Map;
 
+import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.solr.common.ResourceLoader;
 
 public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
@@ -39,18 +42,80 @@
    * Then things will sort and match correctly.
    */
   public void testBasicUsage() throws IOException {
-    String[] turkishUpperCase = { "I", "WİLL", "USE", "TURKİSH", "CASING" };
-    String[] turkishLowerCase = { "ı", "will", "use", "turkish", "casıng" };
+    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
     CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
     Map<String,String> args = new HashMap<String,String>();
     args.put("language", "tr");
     args.put("strength", "primary");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(""));
-    TokenStream tsUpper = factory.create(new IterTokenStream(turkishUpperCase));
-    TokenStream tsLower = factory.create(new IterTokenStream(turkishLowerCase));
-    assertTokEqual(BaseTokenTestCase.getTokens(tsUpper),
-        BaseTokenTestCase.getTokens(tsLower));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+  
+  /*
+   * Test usage of the decomposition option for Unicode normalization.
+   */
+  public void testNormalization() throws IOException {
+    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
+    CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("language", "tr");
+    args.put("strength", "primary");
+    args.put("decomposition", "canonical");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+  
+  /*
+   * Test usage of the K decomposition option for Unicode normalization.
+   * This works even with identical strength.
+   */
+  public void testFullDecomposition() throws IOException {
+    String fullWidth = "Ｔｅｓｔｉｎｇ";
+    String halfWidth = "Testing";
+    CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("language", "zh");
+    args.put("strength", "identical");
+    args.put("decomposition", "full");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsFull = factory.create(
+        new KeywordTokenizer(new StringReader(fullWidth)));
+    TokenStream tsHalf = factory.create(
+        new KeywordTokenizer(new StringReader(halfWidth)));
+    assertCollatesToSame(tsFull, tsHalf);
+  }
+  
+  /*
+   * Test secondary strength; for English, case is not significant.
+   */
+  public void testSecondaryStrength() throws IOException {
+    String upperCase = "TESTING";
+    String lowerCase = "testing";
+    CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("language", "en");
+    args.put("strength", "secondary");
+    args.put("decomposition", "no");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(upperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(lowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
   }
 
   /*
@@ -74,20 +139,22 @@
     // at this point, you would save these tailoredRules to a file, 
     // and use the custom parameter.
     //
-    String[] germanUmlaut = { "Töne" };
-    String[] germanOE = { "Toene" };
+    String germanUmlaut = "Töne";
+    String germanOE = "Toene";
     CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
     Map<String,String> args = new HashMap<String,String>();
     args.put("custom", "rules.txt");
     args.put("strength", "primary");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(tailoredRules));
-    TokenStream tsUmlaut = factory.create(new IterTokenStream(germanUmlaut));
-    TokenStream tsOE = factory.create(new IterTokenStream(germanOE));
-    assertTokEqual(BaseTokenTestCase.getTokens(tsUmlaut),
-        BaseTokenTestCase.getTokens(tsOE));
-  }
+    TokenStream tsUmlaut = factory.create(
+        new KeywordTokenizer(new StringReader(germanUmlaut)));
+    TokenStream tsOE = factory.create(
+        new KeywordTokenizer(new StringReader(germanOE)));
 
+    assertCollatesToSame(tsUmlaut, tsOE);
+  }
+  
   private class StringMockSolrResourceLoader implements ResourceLoader {
     String text;
 
@@ -107,4 +174,17 @@
       return new ByteArrayInputStream(text.getBytes("UTF-8"));
     }
   }
+  
+  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
+      throws IOException {
+    TermAttribute term1 = (TermAttribute) stream1
+        .addAttribute(TermAttribute.class);
+    TermAttribute term2 = (TermAttribute) stream2
+        .addAttribute(TermAttribute.class);
+    assertTrue(stream1.incrementToken());
+    assertTrue(stream2.incrementToken());
+    assertEquals(term1.term(), term2.term());
+    assertFalse(stream1.incrementToken());
+    assertFalse(stream2.incrementToken());
+  }
 }
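
The new assertCollatesToSame helper simply checks that the filter produced identical
collation keys for both inputs. The JDK behavior it leans on can be observed without
Lucene at all (a sketch; the expected 0 follows from the equality testBasicUsage
asserts for these exact strings):

    import java.text.Collator;
    import java.util.Locale;

    public class CollationDemo {
      public static void main(String[] args) {
        Collator collator = Collator.getInstance(new Locale("tr"));
        // PRIMARY strength ignores case and accent differences.
        collator.setStrength(Collator.PRIMARY);
        System.out.println(collator.compare("I WİLL USE TURKİSH CASING",
                                            "ı will use turkish casıng")); // 0
      }
    }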

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,51 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+
+/**
+ * Simple tests to ensure the Dictionary compound filter factory is working.
+ */
+public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually decompounds text.
+   */
+  public void testDecompounding() throws Exception {
+    Reader reader = new StringReader("I like to play softball");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("dictionary", "compoundDictionary.txt");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, 
+        new String[] { "I", "like", "to", "play", "softball", "soft", "ball" });
+  }
+  
+}
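
The expected tokens spell out the decompounder's contract: the original token is
kept and matching dictionary subwords are appended after it. The
compoundDictionary.txt file is added elsewhere in this commit; assuming it contains
at least "soft" and "ball", the filter can be exercised directly (a sketch; the
DecompoundDemo class is made up):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class DecompoundDemo {
      public static void main(String[] args) throws Exception {
        String[] dict = { "soft", "ball" }; // assumed dictionary contents
        TokenStream ts = new DictionaryCompoundWordTokenFilter(
            new WhitespaceTokenizer(new StringReader("I like to play softball")),
            dict);
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
        // prints: I, like, to, play, softball, soft, ball (one per line)
      }
    }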

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Dutch stem filter factory is working.
+ */
+public class TestDutchStemFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually stems text.
+   */
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("lichamelijkheden");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    DutchStemFilterFactory factory = new DutchStemFilterFactory();
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "licham" });
+  }
+}


