lucene-java-commits mailing list archives

From rm...@apache.org
Subject svn commit: r805400 [2/2] - in /lucene/java/trunk/contrib: analyzers/common/src/java/org/apache/lucene/analysis/ar/ analyzers/common/src/java/org/apache/lucene/analysis/br/ analyzers/common/src/java/org/apache/lucene/analysis/cjk/ analyzers/common/src/...
Date Tue, 18 Aug 2009 12:55:28 GMT
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
Tue Aug 18 12:55:26 2009
@@ -29,10 +29,12 @@
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
- * Analyzer for Russian language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for Russian language. 
+ * <p>
+ * Supports an external list of stopwords (words that
  * will not be indexed at all).
  * A default set of stopwords is used unless an alternative list is specified.
- *
+ * </p>
  *
  * @version $Id$
  */
@@ -246,10 +248,13 @@
     }
 
     /**
-     * Creates a TokenStream which tokenizes all the text in the provided Reader.
+     * Creates a {@link TokenStream} which tokenizes all the text in the 
+     * provided {@link Reader}.
      *
-     * @return  A TokenStream built from a RussianLetterTokenizer filtered with
-     *                  RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+     * @return  A {@link TokenStream} built from a 
+     *   {@link RussianLetterTokenizer} filtered with 
+     *   {@link RussianLowerCaseFilter}, {@link StopFilter}, 
+     *   and {@link RussianStemFilter}
      */
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
@@ -266,11 +271,13 @@
     };
     
     /**
-     * Returns a (possibly reused) TokenStream which tokenizes all the text 
-     * in the provided Reader.
+     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+     * in the provided {@link Reader}.
      *
-     * @return  A TokenStream built from a RussianLetterTokenizer filtered with
-     *                  RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+     * @return  A {@link TokenStream} built from a 
+     *   {@link RussianLetterTokenizer} filtered with 
+     *   {@link RussianLowerCaseFilter}, {@link StopFilter}, 
+     *   and {@link RussianStemFilter}
      */
     public TokenStream reusableTokenStream(String fieldName, Reader reader) 
       throws IOException {
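
For illustration, the chain this javadoc describes can be consumed with the same attribute API the rest of this commit uses. A minimal sketch (field name and sample text are placeholders, not part of the commit):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ru.RussianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class RussianAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new RussianAnalyzer();
        // tokenStream() builds: RussianLetterTokenizer -> RussianLowerCaseFilter
        //                       -> StopFilter -> RussianStemFilter
        TokenStream ts = analyzer.tokenStream("dummy", new StringReader("Вместе с тем о силе"));
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());
        }
        ts.close();
      }
    }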

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
Tue Aug 18 12:55:26 2009
@@ -19,10 +19,11 @@
 /**
 * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Russian characters in Unicode, KOI8 and CP1252.
+ * <p>
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to the toLowerCase() method for that charset.
- *
+ * </p>
  *
  * @version $Id$
  */
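
As a sketch of how these charset tables are meant to be plugged in (the char[]-taking RussianAnalyzer constructor and the UnicodeRussian/KOI8 constants are assumed from this era of the code and are not shown in this diff):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.ru.RussianAnalyzer;
    import org.apache.lucene.analysis.ru.RussianCharsets;

    // the no-arg analyzer defaults to Unicode; other tables can be passed explicitly
    Analyzer unicodeAnalyzer = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
    Analyzer koi8Analyzer = new RussianAnalyzer(RussianCharsets.KOI8);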

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
Tue Aug 18 12:55:26 2009
@@ -19,13 +19,18 @@
 
 import java.io.Reader;
 import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.Tokenizer; // for javadocs
+import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
 
 /**
- * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
- * in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
+ * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
+ * by additionally looking up letters in a given "russian charset". 
+ * <p>
+ * The problem with 
+ * {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
  * which doesn't know how to detect letters in encodings like CP1252 and KOI8
  * (well-known problems with 0xD7 and 0xF7 chars)
- *
+ * </p>
  *
  * @version $Id$
  */
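
A tokenizer-level sketch of the lookup described above (the two-argument constructor taking a charset table is assumed from the class as it stood at this revision):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ru.RussianCharsets;
    import org.apache.lucene.analysis.ru.RussianLetterTokenizer;

    // letters are recognized against the supplied table instead of Character.isLetter() alone
    Tokenizer tokenizer = new RussianLetterTokenizer(
        new StringReader("русский текст"), RussianCharsets.UnicodeRussian);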

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
Tue Aug 18 12:55:26 2009
@@ -20,7 +20,6 @@
 import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
Tue Aug 18 12:55:26 2009
@@ -17,7 +17,6 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -25,10 +24,12 @@
 import java.io.IOException;
 
 /**
- * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
- * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
- * because RussianStemFilter only works  with lowercase part of any "russian" charset.
- *
+ * A {@link TokenFilter} that stems Russian words. 
+ * <p>
+ * The implementation was inspired by GermanStemFilter.
+ * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter,
+ * because RussianStemFilter only works with lowercase part of any "russian" charset.
+ * </p>
  *
  * @version   $Id$
  */
@@ -66,7 +67,7 @@
 
 
     /**
-     * Set a alternative/custom RussianStemmer for this filter.
+     * Set an alternative/custom {@link RussianStemmer} for this filter.
      */
     public void setStemmer(RussianStemmer stemmer)
     {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
Tue Aug 18 12:55:26 2009
@@ -25,8 +25,10 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 
 /**
- * A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer. A
- * shingle is another namefor a token based n-gram.
+ * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
+ * <p>
+ * A shingle is another name for a token based n-gram.
+ * </p>
  */
 public class ShingleAnalyzerWrapper extends Analyzer {
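
A minimal usage sketch (the (Analyzer, maxShingleSize) constructor is assumed; by default the underlying ShingleFilter emits unigrams as well as shingles):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

    // wrap any analyzer to additionally produce word bigrams
    Analyzer shingleAnalyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    // "please divide this" -> please, "please divide", divide, "divide this", this
    TokenStream ts = shingleAnalyzer.tokenStream("body", new StringReader("please divide this"));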
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
Tue Aug 18 12:55:26 2009
@@ -76,7 +76,7 @@
 
   /**
   * Constructs a ShingleFilter with the specified shingle size from the
-   * TokenStream <code>input</code>
+   * {@link TokenStream} <code>input</code>
    *
    * @param input input stream
    * @param maxShingleSize maximum shingle size produced by the filter.
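
For reference, a hedged sketch of constructing the filter directly over any TokenStream (input text and sizes are arbitrary):

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.shingle.ShingleFilter;

    // bigram shingles over a whitespace-tokenized stream;
    // "a b c" -> a, "a b", b, "b c", c (unigrams are kept by default)
    ShingleFilter shingles =
        new ShingleFilter(new WhitespaceTokenizer(new StringReader("a b c")), 2);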

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
Tue Aug 18 12:55:26 2009
@@ -129,7 +129,7 @@
     /**
     * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted into a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
      * @param token
-     * @return
+     * @return {@link ShingleMatrixFilter.TokenPositioner}
      * @throws IOException
      */
     public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
@@ -1014,7 +1014,7 @@
     * Returns a 32 bit float from the payload, or 1f if it is null.
      *
      * @param token
-     * @return
+     * @return 32 bit float
      */
     public float getWeight(Token token) {
       if (token.getPayload() == null || token.getPayload().getData() == null) {
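
A hedged sketch of supplying such a weight (Token, Payload and PayloadHelper are assumed from org.apache.lucene.analysis, org.apache.lucene.index and org.apache.lucene.analysis.payloads of this era; the values are arbitrary):

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.index.Payload;

    Token token = new Token("hello", 0, 5);
    // encode 2.0f into the payload; getWeight(token) would then return 2.0f instead of 1f
    token.setPayload(new Payload(PayloadHelper.encodeFloat(2.0f)));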

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html?rev=805400&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
Tue Aug 18 12:55:26 2009
@@ -0,0 +1,5 @@
+<html><head></head>
+<body>
+Word n-gram filters
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
Tue Aug 18 12:55:26 2009
@@ -27,7 +27,7 @@
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 /**
- * Analyzer for Thai language. It uses java.text.BreakIterator to break words.
+ * {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
  * @version 0.2
  */
 public class ThaiAnalyzer extends Analyzer {
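
A minimal usage sketch (field name and sample sentence are placeholders):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.th.ThaiAnalyzer;

    Analyzer analyzer = new ThaiAnalyzer();
    // Thai is written without spaces between words; BreakIterator finds the boundaries
    TokenStream ts = analyzer.tokenStream("body", new StringReader("การที่ได้ต้องแสดงว่างานดี"));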

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
Tue Aug 18 12:55:26 2009
@@ -28,7 +28,7 @@
 import java.text.BreakIterator;
 
 /**
- * TokenFilter that use java.text.BreakIterator to break each 
+ * {@link TokenFilter} that uses {@link java.text.BreakIterator} to break each
  * Token that is Thai into separate Token(s) for each Thai word.
  * @version 0.2
  */

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html?rev=805400&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
Tue Aug 18 12:55:26 2009
@@ -0,0 +1,5 @@
+<html><head></head>
+<body>
+Analyzer for Thai.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
Tue Aug 18 12:55:26 2009
@@ -118,6 +118,14 @@
 	 check("quiosque", "quiosqu");
   }
   
+  public void testNormalization() throws Exception {
+    check("Brasil", "brasil"); // lowercase by default
+    check("Brasília", "brasil"); // remove diacritics
+    check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will
still be removed
+    check("áá", "áá"); // token is too short: diacritics are not removed
+    check("ááá", "aaa"); // normally, diacritics are removed
+  }
+  
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new BrazilianAnalyzer();
     checkReuse(a, "boa", "boa");
@@ -126,6 +134,11 @@
     checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
   }
  
+  public void testStemExclusionTable() throws Exception {
+    BrazilianAnalyzer a = new BrazilianAnalyzer();
+    a.setStemExclusionTable(new String[] { "quintessência" });
+    checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely
unchanged.
+  }
 
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new BrazilianAnalyzer(); 

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
Tue Aug 18 12:55:26 2009
@@ -169,6 +169,66 @@
     checkCJKToken(str, out_tokens);
   }
   
+  /*
+   * Full-width text is normalized to half-width 
+   */
+  public void testFullWidth() throws Exception {
+    String str = "Ｔｅｓｔ １２３４";
+    TestToken[] out_tokens = { 
+        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+        newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  /*
+   * Non-English text (not just CJK) is treated the same as CJK: C1C2 C2C3
+   */
+  public void testNonIdeographic() throws Exception {
+    String str = "\u4e00 روبرت موير";
+    TestToken[] out_tokens = {
+        newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  /*
+   * Non-English text with non-letters (non-spacing marks, etc.) is treated as C1C2 C2C3,
+   * except that words are split around non-letters.
+   */
+  public void testNonIdeographicNonLetter() throws Exception {
+    String str = "\u4e00 رُوبرت موير";
+    TestToken[] out_tokens = {
+        newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  public void testTokenStream() throws Exception {
+    Analyzer analyzer = new CJKAnalyzer();
+    TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+    assertTrue(ts.incrementToken());
+    assertEquals("\u4e00\u4e01", termAtt.term());
+    assertTrue(ts.incrementToken());
+    assertEquals("\u4e01\u4e02", termAtt.term());
+    assertFalse(ts.incrementToken());
+  }
+  
   public void testReusableTokenStream() throws Exception {
     Analyzer analyzer = new CJKAnalyzer();
     String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
Tue Aug 18 12:55:26 2009
@@ -18,12 +18,15 @@
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
@@ -59,6 +62,76 @@
         new int[] { 1, 2, 3 });
     }
     
+    /*
+     * Analyzer that just uses ChineseTokenizer, not ChineseFilter.
+     * A convenience to show the behavior of the tokenizer.
+     */
+    private class JustChineseTokenizerAnalyzer extends Analyzer {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new ChineseTokenizer(reader);
+      }   
+    }
+    
+    /*
+     * Analyzer that just uses ChineseFilter, not ChineseTokenizer.
+     * A convenience to show the behavior of the filter.
+     */
+    private class JustChineseFilterAnalyzer extends Analyzer {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new ChineseFilter(new WhitespaceTokenizer(reader));
+      }
+    }
+    
+    /*
+     * ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter
+     */
+    public void testNumerics() throws Exception
+    { 
+      Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
+      assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });
+          
+      // in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
+      Analyzer a = new ChineseAnalyzer(); 
+      assertAnalyzesTo(a, "中1234", new String[] { "中" });
+    }
+    
+    /*
+     * ChineseTokenizer tokenizes English similarly to SimpleAnalyzer.
+     * It will lowercase terms automatically.
+     * 
+     * ChineseFilter has an English stopword list; it also removes any single-character tokens.
+     * The stopword list is case-sensitive.
+     */
+    public void testEnglish() throws Exception
+    {
+      Analyzer chinese = new ChineseAnalyzer();
+      assertAnalyzesTo(chinese, "This is a Test. b c d",
+          new String[] { "test" });
+      
+      Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
+      assertAnalyzesTo(justTokenizer, "This is a Test. b c d",
+          new String[] { "this", "is", "a", "test", "b", "c", "d" });
+      
+      Analyzer justFilter = new JustChineseFilterAnalyzer();
+      assertAnalyzesTo(justFilter, "This is a Test. b c d", 
+          new String[] { "This", "Test." });
+    }
+    
+    private void assertAnalyzesTo(Analyzer a, String input, String[] output)
+      throws Exception {
+      TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+      TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+
+      for (int i = 0; i < output.length; i++) {
+        assertTrue(ts.incrementToken());
+        assertEquals(output[i], termAtt.term());
+      }
+
+      assertFalse(ts.incrementToken());
+      ts.close();
+    }
+    
     private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
       int startOffsets[], int endOffsets[])
       throws Exception {

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
Tue Aug 18 12:55:26 2009
@@ -90,12 +90,12 @@
   }
 
   private void check(final String input, final String expected) throws IOException {
-    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
-    GermanStemFilter filter = new GermanStemFilter(tokenStream);
-    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
-    assertTrue(filter.incrementToken());
+    Analyzer a = new GermanAnalyzer();
+    TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
+    TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+    assertTrue(tokenStream.incrementToken());
     assertEquals(expected, termAtt.term());
-    filter.close();
+    tokenStream.close();
   }
   
   private void checkReuse(Analyzer a, String input, String expected) throws IOException {

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
Tue Aug 18 12:55:26 2009
@@ -18,9 +18,11 @@
 
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LetterTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -35,6 +37,7 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 
 public class QueryAutoStopWordAnalyzerTest extends TestCase {
  String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
@@ -162,4 +165,37 @@
     Hits h = search(a, "repetitiveField:boring");
     assertFalse(h.length() == 0);
   }
+  
+  /*
+   * Analyzer that does not support reuse;
+   * it is a LetterTokenizer on odd invocations, a WhitespaceTokenizer on even.
+   */
+  private class NonreusableAnalyzer extends Analyzer {
+    int invocationCount = 0;
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      if (++invocationCount % 2 == 0)
+        return new WhitespaceTokenizer(reader);
+      else
+        return new LetterTokenizer(reader);
+    }
+  }
+  
+  public void testWrappingNonReusableAnalyzer() throws Exception {
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new NonreusableAnalyzer());
+    a.addStopWords(reader, 10);
+    Hits h = search(a, "repetitiveField:boring");
+    assertTrue(h.length() == 0);
+    h = search(a, "repetitiveField:vaguelyboring");
+    assertTrue(h.length() == 0);
+  }
+  
+  public void testTokenStream() throws Exception {
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
+    a.addStopWords(reader, 10);
+    TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+    assertTrue(ts.incrementToken());
+    assertEquals("this", termAtt.term());
+    assertFalse(ts.incrementToken());
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
Tue Aug 18 12:55:26 2009
@@ -336,7 +336,9 @@
    * @throws IOException
    */
   public void testMatrix() throws IOException {
-
+    // some other tests set this to null.
+    // set it here in case tests are run out of the usual order.
+    ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
     Matrix matrix = new Matrix();
 
     matrix.new Column(tokenFactory("no", 1));

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
Tue Aug 18 12:55:26 2009
@@ -57,7 +57,7 @@
    * This test is the same as the above, except using an ideographic space as a separator.
    * This tests to ensure the stopwords are working correctly.
    */
-  public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception {
+  public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
     Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
     String sentence = "我购买了道具和服装 我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
@@ -101,6 +101,52 @@
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
   }
   
+  /*
+   * Numerics are parsed as their own tokens
+   */
+  public void testNumerics() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234",
+      new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
+  }
+  
+  /*
+   * Full width alphas and numerics are folded to half-width
+   */
+  public void testFullWidth() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
+  }
+  
+  /*
+   * Presentation form delimiters are removed
+   */
+  public void testDelimiters() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买︱ Tests 了道具和服装",
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
+  }
+  
+  /*
+   * Text from writing systems other than Chinese and Latin are parsed as individual characters.
+   * (regardless of Unicode category)
+   */
+  public void testNonChinese() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 روبرتTests 了道具和服装",
+        new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
+  }
+  
+  /*
+   * Test what the analyzer does with out-of-vocabulary words.
+   * In this case the name is Yousaf Raza Gillani.
+   * Currently it is being analyzed into single characters...
+   */
+  public void testOOV() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福·拉扎·吉拉尼",
+      new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+    
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福拉扎吉拉尼",
+      new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+  }
+  
   public void testOffsets() throws Exception {
     assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
         new String[] { "我", "购买", "了", "道具", "和", "服装"
},

Modified: lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java?rev=805400&r1=805399&r2=805400&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
(original)
+++ lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
Tue Aug 18 12:55:26 2009
@@ -109,6 +109,7 @@
         streams.source = new WhitespaceTokenizer(reader);
         streams.result = new LowerCaseFilter(streams.source);
         streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
+        setPreviousTokenStream(streams);
       } else {
         streams.source.reset(reader);
         streams.result.reset(); // reset the SynonymTokenFilter
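
The added setPreviousTokenStream(streams) call is what makes the saved streams visible to the next invocation; without it, reusableTokenStream rebuilt the chain on every call. The full reuse pattern, reconstructed as a sketch from the context lines above (SavedStreams is the test's own holder class):

    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
        setPreviousTokenStream(streams); // remember for reuse
      } else {
        streams.source.reset(reader);
        streams.result.reset(); // reset the SynonymTokenFilter
      }
      return streams.result;
    }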


