lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r916666 [5/16] - in /lucene/java/branches/flex_1458: ./ contrib/ contrib/analyzers/common/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/c...
Date Fri, 26 Feb 2010 13:10:08 GMT
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Fri Feb 26 13:09:54 2010
@@ -26,6 +26,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.Version;
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
@@ -288,7 +289,360 @@
 
   };
 
+  public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+    createToken("please divide this", 0, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence into shingles", 19, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+    1, 1, 1, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+    "shingle",
+    "shingle",
+    "shingle",
+    "shingle"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence into shingles", 19, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 1, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence into shingles", 14, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
+    1, 1, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
+    "shingle",
+    "shingle",
+    "shingle"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+    "word", "shingle", "word", "shingle", "word", "shingle", "word",
+    "shingle", "word", "shingle", "word"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+    createToken("pleasedivide", 0, 13),
+    createToken("dividethis", 7, 18),
+    createToken("thissentence", 14, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("intoshingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("intoshingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please<SEP>divide", 0, 13),
+    createToken("divide", 7, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("this", 14, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("sentence", 19, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("into", 28, 32),
+    createToken("into<SEP>shingles", 28, 39),
+    createToken("shingles", 33, 39),
+  };
 
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+    "word", "shingle", "word", "shingle", "word", "shingle", "word",
+    "shingle", "word", "shingle", "word"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+    createToken("please<SEP>divide", 0, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("into<SEP>shingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please<SEP>divide", 0, 13),
+    createToken("please<SEP>divide<SEP>this", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("divide<SEP>this<SEP>sentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("this<SEP>sentence<SEP>into", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("into<SEP>shingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+    createToken("please<SEP>divide", 0, 13),
+    createToken("please<SEP>divide<SEP>this", 0, 18),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("divide<SEP>this<SEP>sentence", 7, 27),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("this<SEP>sentence<SEP>into", 14, 32),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+    createToken("into<SEP>shingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+
+  public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -379,8 +733,110 @@
   }
   
   
+  public void testTriGramFilterMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
+                           TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
+                           TRI_GRAM_TYPES_MIN_TRI_GRAM,
+                           true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, 
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           false);
+  }
+  
+  public void testFourGramFilterMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM, 
+                           FOUR_GRAM_TYPES_MIN_TRI_GRAM,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 4, TEST_TOKEN, 
+                           FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
+  }
+
+  public void testFourGramFilterMinFourGram() throws IOException {
+    this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM, 
+                           FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
+    this.shingleFilterTest(4, 4, TEST_TOKEN, 
+                           FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
+  }
+ 
+  public void testBiGramFilterNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, 
+                           BI_GRAM_TYPES_NO_SEPARATOR, true);
+  }
+
+  public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 2, TEST_TOKEN, 
+                           BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR, 
+                           BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           false);
+  }
+  public void testTriGramFilterNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, 
+                           TRI_GRAM_TYPES_NO_SEPARATOR, true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
+  }
+  
+  public void testBiGramFilterAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, 
+                           BI_GRAM_TYPES_ALT_SEPARATOR, true);
+  }
+
+  public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, 
+                           BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, 
+                           BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           false);
+  }
+  public void testTriGramFilterAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, 
+                           TRI_GRAM_TYPES_ALT_SEPARATOR, true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
+  }
+
+  public void testTriGramFilterNullSeparator() throws IOException {
+    this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR, 
+                           TRI_GRAM_TYPES_NULL_SEPARATOR, true);
+  }
+  
+  
   public void testReset() throws Exception {
-    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
+    Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
     assertTokenStreamContents(filter,
       new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
@@ -403,30 +859,50 @@
     throws IOException {
 
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    filter.setTokenSeparator(tokenSeparator);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTestCommon(ShingleFilter filter,
+                                         Token[] tokensToCompare,
+                                         int[] positionIncrements,
+                                         String[] types, boolean outputUnigrams)
+    throws IOException {
+
     filter.setOutputUnigrams(outputUnigrams);
 
-    TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class);
-    TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class);
-
-    int i = 0;
-    while (filter.incrementToken()) {
-      assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
-      String termText = termAtt.term();
-      String goldText = tokensToCompare[i].term();
-      assertEquals("Wrong termText", goldText, termText);
-      assertEquals("Wrong startOffset for token \"" + termText + "\"",
-          tokensToCompare[i].startOffset(), offsetAtt.startOffset());
-      assertEquals("Wrong endOffset for token \"" + termText + "\"",
-          tokensToCompare[i].endOffset(), offsetAtt.endOffset());
-      assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
-          positionIncrements[i], posIncrAtt.getPositionIncrement());
-      assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
-      i++;
+    String text[] = new String[tokensToCompare.length];
+    int startOffsets[] = new int[tokensToCompare.length];
+    int endOffsets[] = new int[tokensToCompare.length];
+    
+    for (int i = 0; i < tokensToCompare.length; i++) {
+      text[i] = tokensToCompare[i].term();
+      startOffsets[i] = tokensToCompare[i].startOffset();
+      endOffsets[i] = tokensToCompare[i].endOffset();
     }
-    assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
-                 tokensToCompare.length, i);
+    
+    assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
   }
 
   private static Token createToken(String term, int start, int offset)

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java Fri Feb 26 13:09:54 2010
@@ -22,11 +22,8 @@
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.LinkedList;
-import java.util.HashSet;
-import java.util.Arrays;
 
 import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
 import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
@@ -34,6 +31,7 @@
 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
 import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.Version;
 
 public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
@@ -43,7 +41,7 @@
 
   public void testIterator() throws IOException {
 
-    WhitespaceTokenizer wst = new WhitespaceTokenizer(new StringReader("one two three four five"));
+    WhitespaceTokenizer wst = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("one two three four five"));
     ShingleMatrixFilter smf = new ShingleMatrixFilter(wst, 2, 2, '_', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
 
     int i;
@@ -85,22 +83,12 @@
 
     ts = new ShingleMatrixFilter(tls, 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
 
-
-    assertNext(ts, "please", 0, 6);
-    assertNext(ts, "please divide", 0, 13);
-    assertNext(ts, "divide", 7, 13);
-    assertNext(ts, "divide this", 7, 18);
-    assertNext(ts, "this", 14, 18);
-    assertNext(ts, "this sentence", 14, 27);
-    assertNext(ts, "sentence", 19, 27);
-    assertNext(ts, "sentence into", 19, 32);
-    assertNext(ts, "into", 28, 32);
-    assertNext(ts, "into shingles", 28, 39);
-    assertNext(ts, "shingles", 33, 39);
-
-
-    assertFalse(ts.incrementToken());
-
+    assertTokenStreamContents(ts,
+      new String[] { "please", "please divide", "divide", "divide this",
+        "this", "this sentence", "sentence", "sentence into", "into",
+        "into shingles", "shingles" },
+      new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33 },
+      new int[] { 6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39 });
   }
 
   /**
@@ -546,6 +534,7 @@
         return false;
       }
       Token prototype = (Token) iterator.next();
+      clearAttributes();
       termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
       posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
       flagsAtt.setFlags(prototype.getFlags());

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -25,6 +25,7 @@
 import org.apache.lucene.analysis.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
+import org.apache.lucene.util.Version;
 
 public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
 
@@ -36,7 +37,7 @@
   public void test() throws IOException {
     DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
     String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006  The dogs finally reacted on 7/12/2006";
-    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
     SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
     int count = 0;
     

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
+import org.apache.lucene.util.Version;
 
 public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
 
@@ -34,7 +35,7 @@
   public void test() throws IOException {
     TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
     String test = "The quick red fox jumped over the lazy brown dogs";
-    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
     SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
     
     int count = 0;

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -27,6 +27,7 @@
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
 
 public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
 
@@ -39,7 +40,7 @@
     TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
+    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
     SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
     
     boolean seenDogs = false;

Propchange: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/snowball/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Fri Feb 26 13:09:54 2010
@@ -0,0 +1 @@
+data

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Turkish lowercase filter.
@@ -32,7 +33,7 @@
    * Test composed forms
    */
   public void testTurkishLowerCaseFilter() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
         "\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -43,7 +44,7 @@
    * Test decomposed forms
    */
   public void testDecomposed() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
         "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -56,7 +57,7 @@
    * to U+0130 + U+0316, and is lowercased the same way.
    */
   public void testDecomposed2() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
         "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Fri Feb 26 13:09:54 2010
@@ -27,12 +27,7 @@
  * <p>
  * SmartChineseAnalyzer has a built-in dictionary and stopword list out-of-box.
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- * 
+ * @lucene.experimental
  */
 public class AnalyzerProfile {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java Fri Feb 26 13:09:54 2010
@@ -19,11 +19,7 @@
 
 /**
  * Internal SmartChineseAnalyzer character type constants.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class CharType {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Fri Feb 26 13:09:54 2010
@@ -31,11 +31,7 @@
  * <p>
  * The output tokens can then be broken into words with {@link WordTokenFilter}
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public final class SentenceTokenizer extends Tokenizer {
 
@@ -134,4 +130,11 @@
     super.reset(input);
     reset();
   }
+
+  @Override
+  public void end() throws IOException {
+    // set final offset
+    final int finalOffset = correctOffset(tokenEnd);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -52,11 +52,7 @@
  * The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>.
  * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public final class SmartChineseAnalyzer extends Analyzer {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java Fri Feb 26 13:09:54 2010
@@ -21,11 +21,7 @@
 
 /**
  * SmartChineseAnalyzer utility constants and methods
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class Utility {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
 
 /**
  * Segment a sentence of Chinese text into words.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class WordSegmenter {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -30,11 +30,7 @@
 
 /**
  * A {@link TokenFilter} that breaks sentences into words.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public final class WordTokenFilter extends TokenFilter {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java Fri Feb 26 13:09:54 2010
@@ -19,11 +19,7 @@
 
 /**
  * Internal SmartChineseAnalyzer token type constants
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class WordType {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
  * <p>
  * Contains methods for dealing with GB2312 encoding.
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 abstract class AbstractDictionary {
   /**

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Fri Feb 26 13:09:54 2010
@@ -30,11 +30,7 @@
  * <p>
  * For each start offset, a list of possible token pairs is stored.
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class BiSegGraph {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Fri Feb 26 13:09:54 2010
@@ -34,11 +34,7 @@
 
 /**
  * SmartChineseAnalyzer Bigram dictionary.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class BigramDictionary extends AbstractDictionary {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Fri Feb 26 13:09:54 2010
@@ -26,11 +26,7 @@
 
 /**
  * Finds the optimal segmentation of a sentence into Chinese words
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class HHMMSegmenter {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Fri Feb 26 13:09:54 2010
@@ -22,11 +22,7 @@
  * <p>
  * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class PathNode implements Comparable<PathNode> {
   public double weight;

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Fri Feb 26 13:09:54 2010
@@ -27,11 +27,7 @@
  * <p>
  * For each start offset, a list of possible tokens is stored.
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class SegGraph {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java Fri Feb 26 13:09:54 2010
@@ -23,11 +23,7 @@
 
 /**
  * SmartChineseAnalyzer internal token
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class SegToken {
   /**

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -25,11 +25,7 @@
  * Filters a {@link SegToken} by converting full-width latin to half-width, then lowercasing latin.
  * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
  * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 public class SegTokenFilter {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java Fri Feb 26 13:09:54 2010
@@ -21,11 +21,7 @@
 
 /**
  * A pair of tokens in {@link SegGraph}
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class SegTokenPair {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Fri Feb 26 13:09:54 2010
@@ -35,12 +35,7 @@
 
 /**
  * SmartChineseAnalyzer Word Dictionary
- *
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
+ * @lucene.experimental
  */
 class WordDictionary extends AbstractDictionary {
 

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html Fri Feb 26 13:09:54 2010
@@ -23,11 +23,7 @@
 SmartChineseAnalyzer Hidden Markov Model package.
 </div>
 <div>
-<font color="#FF0000">
-WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. The APIs
-and file formats introduced here might change in the future and will not be supported anymore
-in such a case.
-</font>
+@lucene.experimental
 </div>
 </body>
 </html>

Modified: lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html Fri Feb 26 13:09:54 2010
@@ -24,23 +24,19 @@
 Analyzer for Simplified Chinese, which indexes words.
 </div>
 <div>
-<font color="#FF0000">
-WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. The APIs
-and file formats introduced here might change in the future and will not be supported anymore
-in such a case.
-</font>
+@lucene.experimental
 </div>
 <div>
 Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
 <ul>
-	<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
+	<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
 	<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
 	<li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
 </ul>
 
 Example phrase: "我是中国人"
 <ol>
-	<li>ChineseAnalyzer: 我-是-中-国-人</li>
+	<li>StandardAnalyzer: 我-是-中-国-人</li>
 	<li>CJKAnalyzer: 我是-是中-中国-国人</li>
 	<li>SmartChineseAnalyzer: 我-是-中国-人</li>
 </ol>

Modified: lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java (original)
+++ lucene/java/branches/flex_1458/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java Fri Feb 26 13:09:54 2010
@@ -89,7 +89,7 @@
 
     /**
      *@return    The contents value
-     *@todo      finish this method
+     *TODO:      finish this method
      */
     public String getContents() {
         return contents;

Modified: lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/CHANGES.txt Fri Feb 26 13:09:54 2010
@@ -2,7 +2,19 @@
 
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
-$Id:$
+2/21/2020
+  LUCENE-2254: Add support to the quality package for running
+  experiments with any combination of Title, Description, and Narrative.
+  (Robert Muir)
+
+1/28/2010
+  LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
+  analyzer with ShingleAnalyzerWrapper and specify shingle parameters
+  with the NewShingleAnalyzer task.  (Steven Rowe via Robert Muir)
+
+1/14/2010
+  LUCENE-2210: TrecTopicsReader now properly reads descriptions and
+  narratives from trec topics files.  (Robert Muir)
 
 1/11/2010
   LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,

Modified: lucene/java/branches/flex_1458/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/build.xml?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/build.xml (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/build.xml Fri Feb 26 13:09:54 2010
@@ -128,10 +128,10 @@
     <path id="classpath">
         <pathelement path="${common.dir}/build/classes/java"/>
         <pathelement path="${common.dir}/build/classes/demo"/>
-      <pathelement path="${common.dir}/build/classes/test"/>
         <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
         <pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
         <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
+        <pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
     	<fileset dir="lib">
     		<include name="**/*.jar"/>
     	</fileset>
@@ -145,7 +145,7 @@
     <property name="task.alg" location="conf/micro-standard.alg"/>
     <property name="task.mem" value="140M"/>
 
-    <target name="run-task" depends="compile-test,check-files,get-files" 
+    <target name="run-task" depends="compile,check-files,get-files" 
      description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)">
         <echo>Working Directory: ${working.dir}</echo>
         <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="${task.mem}" fork="true">
@@ -193,6 +193,32 @@
 	    <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
 	</target>
 	
+    <property name="shingle.alg.file" location="conf/shingle.alg"/>
+    <property name="shingle.output.file" 
+              value="${working.dir}/shingle.benchmark.output.txt"/>
+    <property name="shingle.jira.output.file" 
+              value="${working.dir}/shingle.bm2jira.output.txt"/>
+	
+    <path id="shingle.runtime.classpath">
+      <path refid="run.classpath"/>
+    </path>
+	
+    <target name="shingle" depends="compile,compile-analyzers-common,get-files">
+      <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
+      <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark" 
+            maxmemory="${task.mem}" output="${shingle.output.file}">
+        <classpath refid="run.classpath"/>
+        <arg file="${shingle.alg.file}"/>
+      </java>
+      <echo>Benchmark output is in file: ${shingle.output.file}</echo>
+      <echo>Converting to JIRA table format...</echo>
+      <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
+        <arg value="scripts/shingle.bm2jira.pl"/>
+        <arg value="${shingle.output.file}"/>
+      </exec>
+      <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
+    </target>
+
     <target name="compile-demo">
       <subant target="compile-demo">
          <fileset dir="${common.dir}" includes="build.xml"/>
@@ -208,6 +234,11 @@
          <fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
       </subant>
     </target>
+    <target name="compile-analyzers-common">
+      <subant target="compile">
+        <fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
+      </subant>
+    </target>
     <target name="compile-memory">
       <subant target="compile">
          <fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
@@ -219,11 +250,6 @@
       </subant>
     </target>
 
-    <target name="init" depends="common.init,compile-demo,compile-memory,compile-highlighter,compile-vector-highlighter,check-files"/>
-
-    <!-- make sure online collections (reuters) are first downloaded -->
-    <target name="test" depends="init,get-files">
-      <antcall target="common.test" inheritRefs="true" />
-    </target>
+    <target name="init" depends="common.init,compile-demo,compile-memory,compile-highlighter,compile-vector-highlighter"/>
     
 </project>

Added: lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg?rev=916666&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg (added)
+++ lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg Fri Feb 26 13:09:54 2010
@@ -0,0 +1,80 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+#
+# based on micro-standard
+#
+# modified to use wikipedia sources and index entire docs
+# currently just used to measure ingest rate
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+work.dir = /x/lucene/wiki.5M
+
+doc.stored=true
+doc.body.stored=false
+doc.tokenized=false
+doc.body.tokenized=true
+doc.term.vector=false
+log.step.AddDoc = 10000
+log.step.Search = 10000
+compound = false
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.forever = false
+file.query.maker.file = queries.txt
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+docs.file = /x/lucene/enwiki-20090306-lines-1k-fixed.txt
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+
+# -------------------------------------------------------------------------------------
+
+# Open a writer
+OpenIndex
+{
+  # Get a new near-real-time reader, once per second:
+  NearRealtimeReader(1.0) &
+
+  # Warm
+  Search
+
+  # Index with 2 threads, each adding 100 docs per sec
+  [ "Indexing" { AddDoc > : * : 100/sec ] : 2 &
+
+  # Redline search (from queries.txt) with 4 threads
+  [ "Searching" { Search > : * ] : 4 &
+
+  # Wait 60 sec, then wrap up
+  Wait(5.0)
+}
+CloseReader
+
+# Don't keep any changes, so we can re-test on the same index again
+RollbackIndex
+
+RepSumByPref Indexing
+RepSumByPref Searching
+RepSumByPref NearRealtimeReader
+
+

Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/basicNRT.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg Fri Feb 26 13:09:54 2010
@@ -1,97 +1,97 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
-content.source.encoding=UTF-8
-doc.tokenized=false
-doc.body.tokenized=true
-docs.file=work/top100k-out/top.fr.wikipedia.words.txt
-content.source.forever=false
-log.step=100000
-
-{ "Rounds"
-    -NewAnalyzer(KeywordAnalyzer)
-    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
-    ResetInputs
-    { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
-
-    -NewAnalyzer(KeywordAnalyzer)
-    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
-    ResetInputs
-    { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
-
-    -NewAnalyzer(KeywordAnalyzer)
-    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
-    ResetInputs
-    { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
- 
-    -NewAnalyzer(KeywordAnalyzer)
-    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
-    ResetInputs
-    { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
- 
-    -NewLocale(fr)
-    -NewCollationAnalyzer
-    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
-    ResetInputs
-    { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(de)
-    -NewCollationAnalyzer
-    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
-    ResetInputs
-    { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(uk)
-    -NewCollationAnalyzer
-    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
-    ResetInputs
-    { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(en)
-    -NewCollationAnalyzer
-    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
-    ResetInputs
-    { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(fr)
-    -NewCollationAnalyzer(impl:icu)
-    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
-    ResetInputs
-    { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(de)
-    -NewCollationAnalyzer(impl:icu)
-    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
-    ResetInputs
-    { "GermanICU" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(uk)
-    -NewCollationAnalyzer(impl:icu)
-    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
-    ResetInputs
-    { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
-
-    -NewLocale(en)
-    -NewCollationAnalyzer(impl:icu)
-    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
-    ResetInputs
-    { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
-
-    NewRound
-
-} : 5
-
-RepSumByNameRound
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.encoding=UTF-8
+doc.tokenized=false
+doc.body.tokenized=true
+docs.file=work/top100k-out/top.fr.wikipedia.words.txt
+content.source.forever=false
+log.step=100000
+
+{ "Rounds"
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
+
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
+
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
+ 
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
+ 
+    -NewLocale(fr)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(de)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(uk)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(en)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(fr)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(de)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(uk)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(en)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
+
+    NewRound
+
+} : 5
+
+RepSumByNameRound

Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/collation.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/highlight-vs-vector-highlight.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/branches/flex_1458/contrib/benchmark/conf/vector-highlight-profile.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.java Fri Feb 26 13:09:54 2010
@@ -29,8 +29,7 @@
  * single query (body: 1) sorted by docdate, and prints
  * time to reopen and time to run the search.
  *
- * <b>NOTE</b>: this is very experimental at this point, and
- * subject to change.  It's also not generally usable, eg
+ * @lucene.experimental It's also not generally usable, eg
  * you cannot change which query is executed.
  */
 public class NearRealtimeReaderTask extends PerfTask {
@@ -92,6 +91,7 @@
         r = newReader;
       }
     }
+    stopNow = false;
 
     return reopenCount;
   }

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Fri Feb 26 13:09:54 2010
@@ -134,7 +134,6 @@
    * @return number of work items done by this task.
    */
   public final int runAndMaybeStats(boolean reportStats) throws Exception {
-    stopNow = false;
     if (!reportStats || shouldNotRecordStats()) {
       setup();
       int count = doLogic();

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Fri Feb 26 13:09:54 2010
@@ -230,6 +230,8 @@
       getRunData().getPoints().getCurrentStats().setCountsByTime(countsByTime, logByTimeMsec);
     }
 
+    stopNow = false;
+
     return count;
   }
 
@@ -276,6 +278,7 @@
         }
       }
     }
+    stopNow = false;
     return count;
   }
 

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java Fri Feb 26 13:09:54 2010
@@ -30,6 +30,8 @@
 import java.io.File;
 import java.io.FileReader;
 import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Set;
 
 
 /**
@@ -38,12 +40,14 @@
  **/
 public class QueryDriver {
   public static void main(String[] args) throws Exception {
-    if (args.length != 4) {
-      System.err.println("Usage: QueryDriver <topicsFile> <qrelsFile> <submissionFile> <indexDir>");
+    if (args.length < 4 || args.length > 5) {
+      System.err.println("Usage: QueryDriver <topicsFile> <qrelsFile> <submissionFile> <indexDir> [querySpec]");
       System.err.println("topicsFile: input file containing queries");
       System.err.println("qrelsFile: input file containing relevance judgements");
       System.err.println("submissionFile: output submission file for trec_eval");
       System.err.println("indexDir: index directory");
+      System.err.println("querySpec: string composed of fields to use in query consisting of T=title,D=description,N=narrative:");
+      System.err.println("\texample: TD (query on Title + Description). The default is T (title only)");
       System.exit(1);
     }
     
@@ -51,6 +55,7 @@
     File qrelsFile = new File(args[1]);
     SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
     FSDirectory dir = FSDirectory.open(new File(args[3]));
+    String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
     Searcher searcher = new IndexSearcher(dir, true);
 
     int maxResults = 1000;
@@ -68,8 +73,13 @@
     // validate topics & judgments match each other
     judge.validateData(qqs, logger);
 
+    Set<String> fieldSet = new HashSet<String>();
+    if (fieldSpec.indexOf('T') >= 0) fieldSet.add("title");
+    if (fieldSpec.indexOf('D') >= 0) fieldSet.add("description");
+    if (fieldSpec.indexOf('N') >= 0) fieldSet.add("narrative");
+    
     // set the parsing of quality queries into Lucene queries.
-    QualityQueryParser qqParser = new SimpleQQParser("title", "body");
+    QualityQueryParser qqParser = new SimpleQQParser(fieldSet.toArray(new String[0]), "body");
 
     // run the benchmark
     QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java Fri Feb 26 13:09:54 2010
@@ -76,16 +76,31 @@
         k = sb.indexOf(">");
         String title = sb.substring(k+1).trim();
         // description
-        sb = read(reader,"<desc>",null,false,false);
-        sb = read(reader,"<narr>",null,false,true);
-        String descripion = sb.toString().trim();
+        read(reader,"<desc>",null,false,false);
+        sb.setLength(0);
+        String line = null;
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith("<narr>"))
+            break;
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(line);
+        }
+        String description = sb.toString().trim();
+        // narrative
+        sb.setLength(0);
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith("</top>"))
+            break;
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(line);
+        }
+        String narrative = sb.toString().trim();
         // we got a topic!
         fields.put("title",title);
-        fields.put("description",descripion);
+        fields.put("description",description);
+        fields.put("narrative", narrative);
         QualityQuery topic = new QualityQuery(id,fields);
         res.add(topic);
-        // skip narrative, get to end of doc
-        read(reader,"</top>",null,false,false);
       }
     } finally {
       reader.close();

Modified: lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java (original)
+++ lucene/java/branches/flex_1458/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java Fri Feb 26 13:09:54 2010
@@ -21,27 +21,38 @@
 import org.apache.lucene.benchmark.quality.QualityQueryParser;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.Version;
 
 /**
  * Simplistic quality query parser. A Lucene query is created by passing 
- * the value of the specified QualityQuery name-value pair into 
+ * the value of the specified QualityQuery name-value pair(s) into 
  * a Lucene's QueryParser using StandardAnalyzer. */
 public class SimpleQQParser implements QualityQueryParser {
 
-  private String qqName;
+  private String qqNames[];
   private String indexField;
   ThreadLocal<QueryParser> queryParser = new ThreadLocal<QueryParser>();
 
   /**
    * Constructor of a simple qq parser.
+   * @param qqNames name-value pairs of quality query to use for creating the query
+   * @param indexField corresponding index field  
+   */
+  public SimpleQQParser(String qqNames[], String indexField) {
+    this.qqNames = qqNames;
+    this.indexField = indexField;
+  }
+
+  /**
+   * Constructor of a simple qq parser.
    * @param qqName name-value pair of quality query to use for creating the query
    * @param indexField corresponding index field  
    */
   public SimpleQQParser(String qqName, String indexField) {
-    this.qqName = qqName;
-    this.indexField = indexField;
+    this(new String[] { qqName }, indexField);
   }
 
   /* (non-Javadoc)
@@ -53,7 +64,11 @@
       qp = new QueryParser(Version.LUCENE_CURRENT, indexField, new StandardAnalyzer(Version.LUCENE_CURRENT));
       queryParser.set(qp);
     }
-    return qp.parse(qq.getValue(qqName));
+    BooleanQuery bq = new BooleanQuery();
+    for (int i = 0; i < qqNames.length; i++)
+      bq.add(qp.parse(QueryParser.escape(qq.getValue(qqNames[i]))), BooleanClause.Occur.SHOULD);
+    
+    return bq;
   }
 
 }



Mime
View raw message