lucene-commits mailing list archives

From rm...@apache.org
Subject svn commit: r1235919 [5/12] - in /lucene/dev/branches/lucene3661: ./ dev-tools/eclipse/ dev-tools/idea/lucene/contrib/ dev-tools/maven/ dev-tools/maven/solr/core/ dev-tools/maven/solr/solrj/ lucene/ lucene/contrib/ lucene/contrib/sandbox/src/test/org/a...
Date Wed, 25 Jan 2012 20:32:53 GMT
Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Wed Jan 25 20:32:44 2012
@@ -298,4 +298,28 @@ public class TestWordDelimiterFilter ext
         new int[] { 10, 15, 15 },
         new int[] { 2, 1, 0 });
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    int numIterations = atLeast(5);
+    for (int i = 0; i < numIterations; i++) {
+      final int flags = random.nextInt(512);
+      final CharArraySet protectedWords;
+      if (random.nextBoolean()) {
+        protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
+      } else {
+        protectedWords = null;
+      }
+      
+      Analyzer a = new Analyzer() {
+        
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+        }
+      };
+      checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    }
+  }
 }
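
For context: random.nextInt(512) covers every combination of WordDelimiterFilter's nine flag bits (2^9 = 512). A minimal sketch of assembling one such configuration explicitly, using the flag constants defined in this module:

    // Pick an explicit flag set instead of a random int; 512 = 2^9 combinations.
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
              | WordDelimiterFilter.GENERATE_NUMBER_PARTS
              | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
              | WordDelimiterFilter.SPLIT_ON_NUMERICS
              | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    Tokenizer tok = new MockTokenizer(new StringReader("PowerShot500"), MockTokenizer.WHITESPACE, false);
    TokenStream ts = new WordDelimiterFilter(tok, flags, null); // null: no protected words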

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Wed Jan 25 20:32:44 2012
@@ -129,4 +129,27 @@ public class EdgeNGramTokenFilterTest ex
         new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
         new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, 
+            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, 
+            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
+      }    
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Wed Jan 25 20:32:44 2012
@@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ngram
  */
 
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 
 /**
  * Tests {@link EdgeNGramTokenizer} for correctness.
@@ -95,4 +99,25 @@ public class EdgeNGramTokenizerTest exte
     tokenizer.reset(new StringReader("abcde"));
     assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.FRONT, 2, 15);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 2, 15);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Wed Jan 25 20:32:44 2012
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenF
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 
@@ -33,89 +34,102 @@ import java.io.StringReader;
  * Tests {@link NGramTokenFilter} for correctness.
  */
 public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
-    private TokenStream input;
-    
-    @Override
-    public void setUp() throws Exception {
-        super.setUp();
-        input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
-    }
-
-    public void testInvalidInput() throws Exception {
-        boolean gotException = false;
-        try {        
-            new NGramTokenFilter(input, 2, 1);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
-        }
-        assertTrue(gotException);
-    }
-
-    public void testInvalidInput2() throws Exception {
-        boolean gotException = false;
-        try {        
-            new NGramTokenFilter(input, 0, 1);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
-        }
-        assertTrue(gotException);
-    }
-
-    public void testUnigrams() throws Exception {
-      NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
-      assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
-    }
-
-    public void testBigrams() throws Exception {
-      NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
-      assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
-    }
-
-    public void testNgrams() throws Exception {
-      NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
-      assertTokenStreamContents(filter,
+  private TokenStream input;
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
+  }
+  
+  public void testInvalidInput() throws Exception {
+    boolean gotException = false;
+    try {        
+      new NGramTokenFilter(input, 2, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+  
+  public void testInvalidInput2() throws Exception {
+    boolean gotException = false;
+    try {        
+      new NGramTokenFilter(input, 0, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+  
+  public void testUnigrams() throws Exception {
+    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
+    assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+  }
+  
+  public void testBigrams() throws Exception {
+    NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
+    assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
+  }
+  
+  public void testNgrams() throws Exception {
+    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+    assertTokenStreamContents(filter,
         new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
         new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
         new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
-      );
-    }
-
-    public void testOversizedNgrams() throws Exception {
-      NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
-      assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
-    }
-    
-    public void testSmallTokenInStream() throws Exception {
-      input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
-      NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-      assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
-    }
-    
-    public void testReset() throws Exception {
-      WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
-      NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
-      assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
-      tokenizer.reset(new StringReader("abcde"));
-      assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
-    }
-    
-    // LUCENE-3642
-    // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
-    // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
-    // so in this case we behave like WDF, and preserve any modified offsets
-    public void testInvalidOffsets() throws Exception {
-      Analyzer analyzer = new Analyzer() {
-        @Override
-        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-          TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
-          filters = new NGramTokenFilter(filters, 2, 2);
-          return new TokenStreamComponents(tokenizer, filters);
-        }
-      };
-      assertAnalyzesTo(analyzer, "mosfellsbær",
-          new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
-          new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
-          new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 });
-    }
+        );
+  }
+  
+  public void testOversizedNgrams() throws Exception {
+    NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+    assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
+  }
+  
+  public void testSmallTokenInStream() throws Exception {
+    input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
+    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
+  }
+  
+  public void testReset() throws Exception {
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
+    NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
+    assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+    tokenizer.reset(new StringReader("abcde"));
+    assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+  }
+  
+  // LUCENE-3642
+  // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
+        filters = new NGramTokenFilter(filters, 2, 2);
+        return new TokenStreamComponents(tokenizer, filters);
+      }
+    };
+    assertAnalyzesTo(analyzer, "mosfellsbær",
+        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
+        new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+        new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 });
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, 
+            new NGramTokenFilter(tokenizer, 2, 15));
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Wed Jan 25 20:32:44 2012
@@ -18,71 +18,86 @@ package org.apache.lucene.analysis.ngram
  */
 
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
 
 /**
  * Tests {@link NGramTokenizer} for correctness.
  */
 public class NGramTokenizerTest extends BaseTokenStreamTestCase {
-    private StringReader input;
-    
-    @Override
-    public void setUp() throws Exception {
-        super.setUp();
-        input = new StringReader("abcde");
-    }
-
-    public void testInvalidInput() throws Exception {
-        boolean gotException = false;
-        try {        
-            new NGramTokenizer(input, 2, 1);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
-        }
-        assertTrue(gotException);
-    }
-
-    public void testInvalidInput2() throws Exception {
-        boolean gotException = false;
-        try {        
-            new NGramTokenizer(input, 0, 1);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
-        }
-        assertTrue(gotException);
-    }
-
-    public void testUnigrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-        assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
-    }
-
-    public void testBigrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
-        assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
-    }
-
-    public void testNgrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
-        assertTokenStreamContents(tokenizer,
-          new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
-          new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
-          new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
-          5 /* abcde */
+  private StringReader input;
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    input = new StringReader("abcde");
+  }
+  
+  public void testInvalidInput() throws Exception {
+    boolean gotException = false;
+    try {        
+      new NGramTokenizer(input, 2, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+  
+  public void testInvalidInput2() throws Exception {
+    boolean gotException = false;
+    try {        
+      new NGramTokenizer(input, 0, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+  
+  public void testUnigrams() throws Exception {
+    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+    assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+  }
+  
+  public void testBigrams() throws Exception {
+    NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
+    assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
+  }
+  
+  public void testNgrams() throws Exception {
+    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+    assertTokenStreamContents(tokenizer,
+        new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
+        new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
+        new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+        5 /* abcde */
         );
-    }
-
-    public void testOversizedNgrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
-        assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
-    }
-    
-    public void testReset() throws Exception {
-      NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
-      tokenizer.reset(new StringReader("abcde"));
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
-    }
+  }
+  
+  public void testOversizedNgrams() throws Exception {
+    NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+    assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
+  }
+  
+  public void testReset() throws Exception {
+    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+    assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+    tokenizer.reset(new StringReader("abcde"));
+    assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new NGramTokenizer(reader, 2, 15);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java Wed Jan 25 20:32:44 2012
@@ -17,10 +17,13 @@ package org.apache.lucene.analysis.path;
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 
@@ -193,4 +196,16 @@ public class TestPathHierarchyTokenizer 
         new int[]{1},
         path.length());
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new PathHierarchyTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java Wed Jan 25 20:32:44 2012
@@ -17,9 +17,13 @@ package org.apache.lucene.analysis.path;
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 
 public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
 
@@ -154,4 +158,16 @@ public class TestReversePathHierarchyTok
         new int[]{1, 0},
         path.length());
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java Wed Jan 25 20:32:44 2012
@@ -18,14 +18,17 @@
 package org.apache.lucene.analysis.pattern;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.regex.Pattern;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 
 /**
  * Tests {@link PatternReplaceCharFilter}
@@ -172,4 +175,21 @@ public class TestPatternReplaceCharFilte
   private Pattern pattern( String p ){
     return Pattern.compile( p );
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
+      }
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java Wed Jan 25 20:32:44 2012
@@ -17,10 +17,13 @@
 
 package org.apache.lucene.analysis.pattern;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.regex.Pattern;
 
@@ -77,5 +80,28 @@ public class TestPatternReplaceFilter ex
     assertTokenStreamContents(ts,
         new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true);
+        return new TokenStreamComponents(tokenizer, filter);
+      }    
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java Wed Jan 25 20:32:44 2012
@@ -18,17 +18,22 @@
 package org.apache.lucene.analysis.pattern;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 public class TestPatternTokenizer extends BaseTokenStreamTestCase 
@@ -117,4 +122,35 @@ public class TestPatternTokenizer extend
     in.close();
     return out.toString();
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = null;
+        try {
+          tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), -1);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = null;
+        try {
+          tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), 0);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }    
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }
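
For reference, the two analyzers above exercise PatternTokenizer's two modes via its group argument; a hedged sketch of the difference (inputs and outputs illustrative):

    // group = -1: the pattern is a delimiter; the text between matches is emitted:
    //   "babab" with pattern "a"           ->  "b", "b", "b"
    // group >= 0: the matched group itself is emitted:
    //   "babab" with pattern "a", group 0  ->  "a", "a"
    Tokenizer splitter = new PatternTokenizer(new StringReader("babab"), Pattern.compile("a"), -1); // may throw IOException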

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Wed Jan 25 20:32:44 2012
@@ -17,11 +17,14 @@
 
 package org.apache.lucene.analysis.reverse;
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
 
 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@@ -96,4 +99,16 @@ public class TestReverseStringFilter ext
     ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
     assertEquals("abcfed𩬅愯瀛", new String(buffer));
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
+      }
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Wed Jan 25 20:32:44 2012
@@ -18,9 +18,12 @@ package org.apache.lucene.analysis.shing
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -1129,4 +1132,16 @@ public class ShingleFilterTest extends B
     token.setPositionIncrement(positionIncrement);
     return token;
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
+      }
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Wed Jan 25 20:32:44 2012
@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase
         hasSentence = false;
         clearAttributes();
         termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-        offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
         return true;
       } else {
         return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase
       
       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;
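
The fix here wraps token offsets in correctOffset() so that, when a CharFilter precedes the tokenizer and shifts character positions (e.g. mapping "æ" to "ae"), OffsetAttribute still points into the original text. A minimal sketch of such a pipeline, assuming the char-map API used elsewhere in this module:

    // "æ" -> "ae" shifts every following offset by one in the filtered stream;
    // correctOffset() maps offsets in the filtered stream back to the original input.
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("æ", "ae");
    CharStream in = new MappingCharFilter(map, CharReader.get(new StringReader("mosfellsbær")));
    // A Tokenizer reading from 'in' must emit correctOffset(start), correctOffset(end).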

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerTest.java Wed Jan 25 20:32:44 2012
@@ -18,12 +18,15 @@
 
 package org.apache.lucene.analysis.wikipedia;
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.Set;
 import java.util.HashSet;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 
 import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;
@@ -169,4 +172,17 @@ public class WikipediaTokenizerTest exte
     assertFalse(tf.incrementToken());
     tf.close();
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new WikipediaTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      } 
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/icu/build.xml?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/icu/build.xml Wed Jan 25 20:32:44 2012
@@ -112,7 +112,24 @@ are part of the ICU4C package. See http:
       </assertions>
     </java>
   </target>
-			
+
+  <property name="html.strip.charfilter.supp.macros.output.file"
+            location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
+
+  <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
+    <java
+        classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
+        dir="."
+        fork="true"
+        failonerror="true"
+        output="${html.strip.charfilter.supp.macros.output.file}">
+      <classpath>
+        <path refid="additional.dependencies"/>
+        <pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+    </java>
+  </target>
+
   <target name="compile-tools" depends="common.compile-tools">
     <compile
       srcdir="src/tools/java"

Modified: lucene/dev/branches/lucene3661/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Wed Jan 25 20:32:44 2012
@@ -111,7 +111,7 @@ public final class ICUTokenizer extends 
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }  
 
   /*

Modified: lucene/dev/branches/lucene3661/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Wed Jan 25 20:32:44 2012
@@ -102,4 +102,9 @@ public class TestMorfologikAnalyzer exte
     assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
     assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandom() throws Exception {
+    checkRandomData(random, getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER); 
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java Wed Jan 25 20:32:44 2012
@@ -16,11 +16,17 @@
  */
 package org.apache.lucene.analysis.phonetic;
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.util._TestUtil;
 
 public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
 
@@ -65,4 +71,28 @@ public class DoubleMetaphoneFilterTest e
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
 
+  public void testRandom() throws Exception {
+    final int codeLen = _TestUtil.nextInt(random, 1, 8);
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, false));
+      }
+      
+    };
+    checkRandomData(random, a, 1000 * RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, true));
+      }
+      
+    };
+    checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER); 
+  }
 }
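
The third constructor argument toggles injection, which is why the test runs checkRandomData against both settings; a hedged sketch of its effect (outputs illustrative, not asserted):

    Tokenizer t = new MockTokenizer(new StringReader("kuczewski"), MockTokenizer.WHITESPACE, false);
    // inject=false: the phonetic code replaces each token.
    TokenStream codesOnly = new DoubleMetaphoneFilter(t, 4, false);
    // inject=true: the original token is kept and the phonetic code is added
    // at the same position (position increment 0).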

Modified: lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java Wed Jan 25 20:32:44 2012
@@ -17,6 +17,8 @@
 
 package org.apache.lucene.analysis.phonetic;
 
+import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.commons.codec.Encoder;
@@ -25,7 +27,9 @@ import org.apache.commons.codec.language
 import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
@@ -70,4 +74,33 @@ public class TestPhoneticFilter extends 
     PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
     assertTokenStreamContents(filter, expected);
   }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws IOException {
+    Encoder encoders[] = new Encoder[] {
+      new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
+    };
+    
+    for (final Encoder e : encoders) {
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
+        }   
+      };
+      
+      checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
+      
+      Analyzer b = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, true));
+        }   
+      };
+      
+      checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/dev/branches/lucene3661/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Wed Jan 25 20:32:44 2012
@@ -199,9 +199,6 @@ public abstract class PerfTask implement
     return new String(c);
   }
   
-  /* (non-Javadoc)
-   * @see java.lang.Object#toString()
-   */
   @Override
   public String toString() {
     String padd = getPadding();
@@ -248,22 +245,23 @@ public abstract class PerfTask implement
   }
 
   /**
-   * Task setup work that should not be measured for that specific task.
-   * By default it does nothing, but tasks can implement this, moving work from 
-   * doLogic() to this method. Only the work done in doLogicis measured for this task.
-   * Notice that higher level (sequence) tasks containing this task would then 
-   * measure larger time than the sum of their contained tasks.
-   * @throws Exception 
+   * Task setup work that should not be measured for that specific task. By
+   * default it does nothing, but tasks can implement this, moving work from
+   * {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
+   * is measured for this task. Notice that higher level (sequence) tasks
+   * containing this task would then measure a larger time than the sum of their
+   * contained tasks.
    */
   public void setup () throws Exception {
   }
-  
+
   /**
-   * Task tearDown work that should not be measured for that specific task.
-   * By default it does nothing, but tasks can implement this, moving work from 
-   * doLogic() to this method. Only the work done in doLogicis measured for this task.
-   * Notice that higher level (sequence) tasks containing this task would then 
-   * measure larger time than the sum of their contained tasks.
+   * Task tearDown work that should not be measured for that specific task. By
+   * default it does nothing, but tasks can implement this, moving work from
+   * {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
+   * is measured for this task. Notice that higher level (sequence) tasks
+   * containing this task would then measure a larger time than the sum of their
+   * contained tasks.
    */
   public void tearDown() throws Exception {
     if (++logStepCount % logStep == 0) {
@@ -274,16 +272,20 @@ public abstract class PerfTask implement
   }
 
   /**
-   * Sub classes that supports parameters must override this method to return true.
+   * Subclasses that support parameters must override this method to return
+   * true.
+   * 
    * @return true iff this task supports command line params.
    */
   public boolean supportsParams () {
     return false;
   }
-  
+
   /**
    * Set the params of this task.
-   * @exception UnsupportedOperationException for tasks supporting command line parameters.
+   * 
+   * @exception UnsupportedOperationException
+   *              for tasks that do not support command line parameters.
    */
   public void setParams(String params) {
     if (!supportsParams()) {

Modified: lucene/dev/branches/lucene3661/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java (original)
+++ lucene/dev/branches/lucene3661/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java Wed Jan 25 20:32:44 2012
@@ -4,6 +4,7 @@ import java.io.IOException;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.logging.Level;
@@ -100,6 +101,9 @@ public class DirectoryTaxonomyReader imp
 
   private volatile boolean closed = false;
   
+  // set refCount to 1 at start
+  private final AtomicInteger refCount = new AtomicInteger(1);
+  
   /**
    * Open for reading a taxonomy stored in a given {@link Directory}.
    * @param directory
@@ -130,7 +134,7 @@ public class DirectoryTaxonomyReader imp
    * @throws AlreadyClosedException if this IndexReader is closed
    */
   protected final void ensureOpen() throws AlreadyClosedException {
-    if (indexReader.getRefCount() <= 0) {
+    if (getRefCount() <= 0) {
       throw new AlreadyClosedException("this TaxonomyReader is closed");
     }
   }
@@ -415,8 +419,12 @@ public class DirectoryTaxonomyReader imp
 
   public void close() throws IOException {
     if (!closed) {
-      decRef();
-      closed = true;
+      synchronized (this) {
+        if (!closed) {
+          decRef();
+          closed = true;
+        }
+      }
     }
   }
   
@@ -555,27 +563,31 @@ public class DirectoryTaxonomyReader imp
   }
 
   /**
-   * Expert: decreases the refCount of this TaxonomyReader instance. 
-   * If the refCount drops to 0, then pending changes (if any) are 
-   * committed to the taxonomy index and this reader is closed. 
-   * @throws IOException 
+   * Expert: decreases the refCount of this TaxonomyReader instance. If the
+   * refCount drops to 0, then this reader is closed.
    */
   public void decRef() throws IOException {
     ensureOpen();
-    if (indexReader.getRefCount() == 1) {
-      // Do not decRef the indexReader - doClose does it by calling reader.close()
-      doClose();
-    } else {
-      indexReader.decRef();
+    final int rc = refCount.decrementAndGet();
+    if (rc == 0) {
+      boolean success = false;
+      try {
+        doClose();
+        success = true;
+      } finally {
+        if (!success) {
+          // Put reference back on failure
+          refCount.incrementAndGet();
+        }
+      }
+    } else if (rc < 0) {
+      throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
     }
   }
   
-  /**
-   * Expert: returns the current refCount for this taxonomy reader
-   */
+  /** Expert: returns the current refCount for this taxonomy reader */
   public int getRefCount() {
-    ensureOpen();
-    return this.indexReader.getRefCount();
+    return refCount.get();
   }
   
   /**
@@ -587,6 +599,6 @@ public class DirectoryTaxonomyReader imp
    */
   public void incRef() {
     ensureOpen();
-    this.indexReader.incRef();
+    refCount.incrementAndGet();
   }
 }
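
With the reader now carrying its own AtomicInteger count (starting at 1 for the opener) instead of piggybacking on the wrapped IndexReader's, the caller-side contract is the usual incRef/decRef pairing; a short usage sketch against the patched class:

    taxoReader.incRef();              // hand the reader to another consumer
    try {
      // ... use taxoReader ...
    } finally {
      taxoReader.decRef();            // closes the reader when the count reaches 0
    }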

Modified: lucene/dev/branches/lucene3661/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java (original)
+++ lucene/dev/branches/lucene3661/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java Wed Jan 25 20:32:44 2012
@@ -11,6 +11,7 @@ import org.apache.lucene.facet.taxonomy.
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
@@ -178,4 +179,28 @@ public class TestDirectoryTaxonomyReader
     }
   }
   
+  @Test
+  public void testRefreshAndRefCount() throws Exception {
+    Directory dir = new RAMDirectory(); // no need for random directories here
+
+    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
+    taxoWriter.addCategory(new CategoryPath("a"));
+    taxoWriter.commit();
+
+    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
+    assertEquals("wrong refCount", 1, taxoReader.getRefCount());
+
+    taxoReader.incRef();
+    assertEquals("wrong refCount", 2, taxoReader.getRefCount());
+
+    taxoWriter.addCategory(new CategoryPath("a", "b"));
+    taxoWriter.commit();
+    taxoReader.refresh();
+    assertEquals("wrong refCount", 2, taxoReader.getRefCount());
+
+    taxoWriter.close();
+    taxoReader.close();
+    dir.close();
+  }
+
 }

Modified: lucene/dev/branches/lucene3661/modules/join/src/java/org/apache/lucene/search/join/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/join/src/java/org/apache/lucene/search/join/package.html?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/join/src/java/org/apache/lucene/search/join/package.html (original)
+++ lucene/dev/branches/lucene3661/modules/join/src/java/org/apache/lucene/search/join/package.html Wed Jan 25 20:32:44 2012
@@ -42,7 +42,7 @@
 <h2>Search-time joins</h2>
 
 <p>
-  The query time joining is terms based and implemented as two pass search. The first pass collects all the terms from a fromField
+  The query time joining is index term based and implemented as a two-pass search. The first pass collects all the terms from a fromField
  that match the fromQuery. The second pass returns all documents whose terms in a toField match the terms
  collected in the first pass.
 </p>
@@ -62,7 +62,7 @@
 <pre class="prettyprint">
   String fromField = "from"; // Name of the from field
  boolean multipleValuesPerDocument = false; // Set to true only when your fromField has multiple values per document in your index
-  String fromField = "to"; // Name of the to field
+  String toField = "to"; // Name of the to field
   Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
 
   MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
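
  The join query returned above can then be run against the index holding the toField; a short sketch, where
  toSearcher stands in for an IndexSearcher over that index:

  IndexSearcher toSearcher = ...; // searcher over the index containing the to field
  TopDocs joinResults = toSearcher.search(joinQuery, 10); // docs whose toField terms matched the collected terms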

Modified: lucene/dev/branches/lucene3661/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/CHANGES.txt?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene3661/solr/CHANGES.txt Wed Jan 25 20:32:44 2012
@@ -24,11 +24,11 @@ $Id$
 ==================  4.0.0-dev ==================
 Versions of Major Components
 ---------------------
-Apache Tika 0.10
+Apache Tika 1.0
 Carrot2 3.5.0
 Velocity 1.6.4 and Velocity Tools 2.0
 Apache UIMA 2.3.1
-Apache ZooKeeper 3.3.3
+Apache ZooKeeper 3.3.4
 
 
 Upgrading from Solr 3.6-dev
@@ -401,6 +401,14 @@ Upgrading from Solr 3.5
 * As doGet() methods in SimplePostTool was changed to static, the client applications of this
   class need to be recompiled.
 
+* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
+  character offsets it provided, triggering, for example, exceptions in highlighting.
+  HTMLStripCharFilter has been re-implemented, addressing this and other
+  issues.  See the entry for LUCENE-3690 in the Bug Fixes section below for a
+  detailed list of changes.  For people who depend on the behavior of
+  HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
+  (bugs and all) is preserved as LegacyHTMLStripCharFilter.
+
 New Features
 ----------------------
 * SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -442,6 +450,10 @@ New Features
 * SOLR-1709: Distributed support for Date and Numeric Range Faceting
   (Peter Sturge, David Smiley, hossman, Simon Willnauer)
 
+* SOLR-3054, LUCENE-3671: Add TypeTokenFilterFactory that creates TypeTokenFilter
+  that filters tokens based on their TypeAttribute.  (Tommaso Teofili via
+  Uwe Schindler)
+
 Optimizations
 ----------------------
 * SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -483,6 +495,52 @@ Bug Fixes
 
 * SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
 
+* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
+  HTMLStripCharFilter as a JFlex-generated scanner.  See below for a list
+  of bug fixes and other changes.  To get the same behavior as
+  HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
+  use LegacyHTMLStripCharFilter, which is the previous implementation.
+
+  Behavior changes from the previous version:
+
+  - Known offset bugs are fixed.
+  - The "Mark invalid" exceptions reported in SOLR-1283 are no longer
+    triggered (the bug is still present in LegacyHTMLStripCharFilter).
+  - The character entity "&apos;" is now always properly decoded.
+  - More cases of <script> tags are now properly stripped.
+  - CDATA sections are now handled properly.
+  - Valid tag name characters now include the supplementary Unicode characters
+    from Unicode character classes [:ID_Start:] and [:ID_Continue:].
+  - Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
+    and "&AMP;" are now recognized and handled as if they were in lowercase.
+  - The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character 
+    entities for unpaired UTF-16 low and high surrogates (in the range
+    [U+D800-U+DFFF]).
+  - Properly paired numeric character entities for UTF-16 surrogates are now
+    converted to the corresponding code units.
+  - Opening tags with unbalanced quotation marks are now properly stripped.
+  - Literal "<" and ">" characters in opening tags, regardless of whether they
+    appear inside quotation marks, now inhibit recognition (and stripping) of
+    the tags.  The only exception to this is for values of event-handler
+    attributes, e.g. "onClick", "onLoad", "onSelect".
+  - A newline '\n', rather than a space, is now substituted for stripped HTML markup.
+  - Nothing is substituted for opening and closing inline tags - they are
+    simply removed.  The list of inline tags is (case insensitively): <a>,
+    <abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
+    <em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
+    <select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
+    <tt>, <u>, and <var>.
+  - HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
+    feature: opening and closing tags with the given names, including any
+    attributes and their values, are left intact in the output.
+  (Steve Rowe)
+
+* LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
+  HyphenatedWordsFilter where they would create invalid offsets in
+  some situations, leading to problems in highlighting.  (Robert Muir)
+
+* SOLR-2280: commitWithin ignored for a delete query (Juan Grande via janhoy)
+
 Other Changes
 ----------------------
 * SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
@@ -498,6 +556,8 @@ Other Changes
 * SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
   (ehatcher)
 
+* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
+
 Build
 ----------------------
 * SOLR-2487: Add build target to package war without slf4j jars (janhoy)
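
To make the HTMLStripCharFilter migration noted above concrete, a minimal sketch of choosing between the new
filter and the preserved legacy implementation (assuming the CharStream-based constructors; package locations
in a given build may differ):

  Reader html = new StringReader("<b>bold</b> text");
  // new implementation, offset bugs fixed:
  CharStream stripped = new HTMLStripCharFilter(CharReader.get(html));
  // previous behavior, bugs and all:
  // CharStream legacy = new LegacyHTMLStripCharFilter(CharReader.get(html));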

Modified: lucene/dev/branches/lucene3661/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/build.xml?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/build.xml (original)
+++ lucene/dev/branches/lucene3661/solr/build.xml Wed Jan 25 20:32:44 2012
@@ -482,7 +482,7 @@
           <packageset dir="contrib/langid/src/java"/>
           <packageset dir="contrib/uima/src/java"/>
           <group title="Core" packages="org.apache.*" />
-          <group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj*" />
+          <group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj.*,org.apache.zookeeper.*" />
           <group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
           <group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
           <group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />

Modified: lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java Wed Jan 25 20:32:44 2012
@@ -18,8 +18,8 @@ package org.apache.solr.handler.dataimpo
 
 import com.sun.mail.imap.IMAPMessage;
 
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.utils.ParseUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -95,6 +95,8 @@ public class MailEntityProcessor extends
               getStringFromContext("processAttachment",null) == null ? "processAttachement":"processAttachment"
             , true);
 
+    tika = new Tika();
+    
     logConfig();
   }
 
@@ -166,7 +168,10 @@ public class MailEntityProcessor extends
      if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return;
       InputStream is = part.getInputStream();
       String fileName = part.getFileName();
-      String content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(), ctype.getBaseType().toLowerCase(Locale.ENGLISH));
+      Metadata md = new Metadata();
+      md.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH));
+      md.set(Metadata.RESOURCE_NAME_KEY, fileName);
+      String content = tika.parseToString(is, md);
       if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
         if (row.get(ATTACHMENT) == null)
           row.put(ATTACHMENT, new ArrayList<String>());
@@ -529,6 +534,8 @@ public class MailEntityProcessor extends
 
   private boolean processAttachment = true;
 
+  private Tika tika;
+  
   // holds the current state
   private Store mailbox;
   private boolean connected = false;
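
The Tika facade wired in above replaces the removed ParseUtils/TikaConfig calls; the same extraction in a
self-contained sketch (the stream and metadata values are illustrative placeholders):

  Tika tika = new Tika();
  Metadata md = new Metadata();
  md.set(Metadata.CONTENT_TYPE, "text/plain");    // optional hint for type detection
  md.set(Metadata.RESOURCE_NAME_KEY, "mail.txt"); // optional hint: original file name
  String content = tika.parseToString(inputStream, md); // inputStream: any content stream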

Modified: lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Wed Jan 25 20:32:44 2012
@@ -118,9 +118,7 @@ public class TikaEntityProcessor extends
     }
     Parser tikaParser = null;
     if(parser.equals(AUTO_PARSER)){
-      AutoDetectParser parser = new AutoDetectParser();
-      parser.setConfig(tikaConfig);
-      tikaParser = parser;
+      tikaParser = new AutoDetectParser(tikaConfig);
     } else {
       tikaParser = (Parser) context.getSolrCore().getResourceLoader().newInstance(parser);
     }

Modified: lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrWriter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrWriter.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrWriter.java Wed Jan 25 20:32:44 2012
@@ -81,7 +81,7 @@ public class SolrWriter extends DIHWrite
     try {
       log.info("Deleting document: " + id);
       DeleteUpdateCommand delCmd = new DeleteUpdateCommand(req);
-      delCmd.id = id.toString();
+      delCmd.setId(id.toString());
       processor.processDelete(delCmd);
     } catch (IOException e) {
       log.error("Exception while deleteing: " + id, e);

Modified: lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestContentStreamDataSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestContentStreamDataSource.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestContentStreamDataSource.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestContentStreamDataSource.java Wed Jan 25 20:32:44 2012
@@ -173,9 +173,8 @@ public class TestContentStreamDataSource
   }
 
   private JettySolrRunner createJetty(SolrInstance instance) throws Exception {
-    System.setProperty("solr.solr.home", instance.getHomeDir());
     System.setProperty("solr.data.dir", instance.getDataDir());
-    JettySolrRunner jetty = new JettySolrRunner("/solr", 0);
+    JettySolrRunner jetty = new JettySolrRunner(instance.getHomeDir(), "/solr", 0);
     jetty.start();
     return jetty;
   }

Modified: lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestSolrEntityProcessorEndToEnd.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestSolrEntityProcessorEndToEnd.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestSolrEntityProcessorEndToEnd.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestSolrEntityProcessorEndToEnd.java Wed Jan 25 20:32:44 2012
@@ -47,7 +47,7 @@ public class TestSolrEntityProcessorEndT
   
   private static Logger LOG = LoggerFactory.getLogger(TestSolrEntityProcessorEndToEnd.class);
   
-  private static final String SOLR_SOURCE_URL = "http://localhost:8983/solr";
+  //private static final String SOLR_SOURCE_URL = "http://localhost:8983/solr";
   private static final String SOLR_CONFIG = "dataimport-solrconfig.xml";
   private static final String SOLR_SCHEMA = "dataimport-schema.xml";
   private static final String SOLR_HOME = "dih/solr";
@@ -68,29 +68,36 @@ public class TestSolrEntityProcessorEndT
     solrDoc.put("desc", "SolrDescription");
     SOLR_DOCS.add(solrDoc);
   }
-  
-  private static final String DIH_CONFIG_TAGS_INNER_ENTITY = "<dataConfig>\r\n"
-      + "  <dataSource type='MockDataSource' />\r\n"
-      + "  <document>\r\n"
-      + "    <entity name='db' query='select * from x'>\r\n"
-      + "      <field column='dbid_s' />\r\n"
-      + "      <field column='dbdesc_s' />\r\n"
-      + "      <entity name='se' processor='SolrEntityProcessor' query='id:${db.dbid_s}'\n"
-      + "     url='" + SOLR_SOURCE_URL + "' fields='id,desc'>\r\n"
-      + "        <field column='id' />\r\n"
-      + "        <field column='desc' />\r\n" + "      </entity>\r\n"
-      + "    </entity>\r\n" + "  </document>\r\n" + "</dataConfig>\r\n";
+
   
   private SolrInstance instance = null;
   private JettySolrRunner jetty;
   
-  private static String generateDIHConfig(String options) {
+  private static String getDihConfigTagsInnerEntity(int port) {
+    return  "<dataConfig>\r\n"
+        + "  <dataSource type='MockDataSource' />\r\n"
+        + "  <document>\r\n"
+        + "    <entity name='db' query='select * from x'>\r\n"
+        + "      <field column='dbid_s' />\r\n"
+        + "      <field column='dbdesc_s' />\r\n"
+        + "      <entity name='se' processor='SolrEntityProcessor' query='id:${db.dbid_s}'\n"
+        + "     url='" + getSourceUrl(port) + "' fields='id,desc'>\r\n"
+        + "        <field column='id' />\r\n"
+        + "        <field column='desc' />\r\n" + "      </entity>\r\n"
+        + "    </entity>\r\n" + "  </document>\r\n" + "</dataConfig>\r\n";
+  }
+  
+  private static String generateDIHConfig(String options, int port) {
     return "<dataConfig>\r\n" + "  <document>\r\n"
         + "    <entity name='se' processor='SolrEntityProcessor'" + "   url='"
-        + SOLR_SOURCE_URL + "' " + options + " />\r\n" + "  </document>\r\n"
+        + getSourceUrl(port) + "' " + options + " />\r\n" + "  </document>\r\n"
         + "</dataConfig>\r\n";
   }
   
+  private static String getSourceUrl(int port) {
+    return "http://localhost:" + port + "/solr";
+  }
+  
   //TODO: fix this test to close its directories
   static String savedFactory;
   @BeforeClass
@@ -107,7 +114,7 @@ public class TestSolrEntityProcessorEndT
       System.setProperty("solr.directoryFactory", savedFactory);
     }
   }
-  
+
   @Override
   @Before
   public void setUp() throws Exception {
@@ -138,7 +145,7 @@ public class TestSolrEntityProcessorEndT
     
     try {
       addDocumentsToSolr(SOLR_DOCS);
-      runFullImport(generateDIHConfig("query='*:*' rows='2' fields='id,desc' onError='skip'"));
+      runFullImport(generateDIHConfig("query='*:*' rows='2' fields='id,desc' onError='skip'", jetty.getLocalPort()));
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -156,7 +163,7 @@ public class TestSolrEntityProcessorEndT
       addDocumentsToSolr(generateSolrDocuments(30));
       Map<String,String> map = new HashMap<String,String>();
       map.put("rows", "50");
-      runFullImport(generateDIHConfig("query='*:*' fq='desc:Description1*,desc:Description*2' rows='2'"), map);
+      runFullImport(generateDIHConfig("query='*:*' fq='desc:Description1*,desc:Description*2' rows='2'", jetty.getLocalPort()), map);
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -171,7 +178,7 @@ public class TestSolrEntityProcessorEndT
     
     try {
       addDocumentsToSolr(generateSolrDocuments(7));
-      runFullImport(generateDIHConfig("query='*:*' fields='id' rows='2'"));
+      runFullImport(generateDIHConfig("query='*:*' fields='id' rows='2'", jetty.getLocalPort()));
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -197,7 +204,7 @@ public class TestSolrEntityProcessorEndT
     try {
       MockDataSource.setIterator("select * from x", DB_DOCS.iterator());
       addDocumentsToSolr(SOLR_DOCS);
-      runFullImport(DIH_CONFIG_TAGS_INNER_ENTITY);
+      runFullImport(getDihConfigTagsInnerEntity(jetty.getLocalPort()));
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -224,7 +231,7 @@ public class TestSolrEntityProcessorEndT
     assertQ(req("*:*"), "//result[@numFound='0']");
     
     try {
-      runFullImport(generateDIHConfig("query='*:*' rows='2' fields='id,desc' onError='skip'"));
+      runFullImport(generateDIHConfig("query='*:*' rows='2' fields='id,desc' onError='skip'", jetty.getLocalPort()));
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -237,7 +244,7 @@ public class TestSolrEntityProcessorEndT
     assertQ(req("*:*"), "//result[@numFound='0']");
     
     try {
-      runFullImport(generateDIHConfig("query='bogus:3' rows='2' fields='id,desc' onError='abort'"));
+      runFullImport(generateDIHConfig("query='bogus:3' rows='2' fields='id,desc' onError='abort'", jetty.getLocalPort()));
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -255,8 +262,7 @@ public class TestSolrEntityProcessorEndT
       addDocumentsToSolr(docList);
       Map<String,String> map = new HashMap<String,String>();
       map.put("rows", "50");
-      runFullImport(generateDIHConfig("query='*:*' rows='6' numThreads='4'"),
-          map);
+      runFullImport(generateDIHConfig("query='*:*' rows='6' numThreads='4'", jetty.getLocalPort()), map);
     } catch (Exception e) {
       LOG.error(e.getMessage(), e);
       fail(e.getMessage());
@@ -287,7 +293,7 @@ public class TestSolrEntityProcessorEndT
     }
     
     HttpClient client = new HttpClient(new MultiThreadedHttpConnectionManager());
-    URL url = new URL(SOLR_SOURCE_URL);
+    URL url = new URL(getSourceUrl(jetty.getLocalPort()));
     CommonsHttpSolrServer solrServer = new CommonsHttpSolrServer(url, client);
     solrServer.add(sidl);
     solrServer.commit(true, true);
@@ -343,9 +349,8 @@ public class TestSolrEntityProcessorEndT
   }
   
   private JettySolrRunner createJetty(SolrInstance instance) throws Exception {
-    System.setProperty("solr.solr.home", instance.getHomeDir());
     System.setProperty("solr.data.dir", instance.getDataDir());
-    JettySolrRunner jetty = new JettySolrRunner("/solr", 8983);
+    JettySolrRunner jetty = new JettySolrRunner(instance.getHomeDir(), "/solr", 0);
     jetty.start();
     return jetty;
   }
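
The switch from the fixed port 8983 to port 0 lets Jetty bind any free ephemeral port, which the test reads
back to build its URLs; the pattern in brief, with arguments as in the diff above:

  JettySolrRunner jetty = new JettySolrRunner(instance.getHomeDir(), "/solr", 0); // 0 = any free port
  jetty.start();
  int port = jetty.getLocalPort(); // the port actually bound, fed to getSourceUrl(port)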

Modified: lucene/dev/branches/lucene3661/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/extraction/CHANGES.txt?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/extraction/CHANGES.txt Wed Jan 25 20:32:44 2012
@@ -20,7 +20,7 @@ to your Solr Home lib directory.  See ht
 Tika Dependency
 ---------------
 
-Current Version: Tika 0.10 (released 2011-09-30)
+Current Version: Tika 1.0 (released 2011-11-07)
 
 $Id$
 
@@ -34,6 +34,8 @@ $Id$
   This is convenient when Tika's auto detector cannot detect encoding, especially
   the text file is too short to detect encoding. (koji)
 
+* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
+
 ================== Release 3.5.0 ==================
 
 * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)

Modified: lucene/dev/branches/lucene3661/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Jan 25 20:32:44 2012
@@ -39,6 +39,7 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -138,7 +139,7 @@ public class ExtractingDocumentLoader ex
     if (streamType != null) {
       //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
       MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ENGLISH));
-      parser = config.getParser(mt);
+      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
     } else {
       parser = autoDetectParser;
     }
@@ -151,6 +152,10 @@ public class ExtractingDocumentLoader ex
       if (resourceName != null) {
         metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
       }
+      // Provide stream's content type as hint for auto detection
+      if(stream.getContentType() != null) {
+        metadata.add(Metadata.CONTENT_TYPE, stream.getContentType());
+      }
 
       InputStream inputStream = null;
       try {
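
Tika 1.0 drops the config.getParser(MediaType) lookup this code previously relied on; the replacement resolves
the parser through DefaultParser's parser map, roughly:

  MediaType mt = MediaType.parse("application/pdf");
  Parser parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
  // parser is null when no parser is registered for the media type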

Modified: lucene/dev/branches/lucene3661/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/lucene3661/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Jan 25 20:32:44 2012
@@ -18,7 +18,6 @@ package org.apache.solr.handler.extracti
 
 import java.util.ArrayList;
 import java.util.List;
-
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.util.ContentStream;
@@ -419,7 +418,33 @@ public class ExtractingRequestHandlerTes
     assertU(commit());
     assertQ(req("*:*"), "//result[@numFound=1]");
   }
+  
+  @Test
+  public void testWrongStreamType() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    try{
+      // Load plain text specifying another mime type, should fail
+      loadLocal("extraction/version_control.txt", 
+              "literal.id", "one",
+              ExtractingParams.STREAM_TYPE, "application/pdf"
+      );
+      fail("SolrException is expected because wrong parser specified for the file type");
+    }
+    catch(Exception expected){}
 
+    try{
+      // Load plain text specifying a nonexistent mimetype, should fail
+      loadLocal("extraction/version_control.txt", 
+              "literal.id", "one",
+              ExtractingParams.STREAM_TYPE, "foo/bar"
+      );
+      fail("SolrException is expected because nonexsisting parser specified");
+    }
+    catch(Exception expected){}
+  }
+  
   SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
     LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
     try {


