lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r885216 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/cz/ analyzers/common/src/test/org/apache/lucene/analysis/cz/
Date Sun, 29 Nov 2009 11:59:38 GMT
Author: rmuir
Date: Sun Nov 29 11:59:38 2009
New Revision: 885216

URL: http://svn.apache.org/viewvc?rev=885216&view=rev
Log:
LUCENE-2067: Add a stemmer for Czech

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
  (with props)
Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=885216&r1=885215&r2=885216&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Nov 29 11:59:38 2009
@@ -10,6 +10,11 @@
    now reverses supplementary characters correctly if used with Version > 3.0.
    (Simon Willnauer, Robert Muir)
    
+New features
+
+ * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
+   when Version is set to 3.1 or higher.  (Robert Muir)
+   
 
 ======================= Release 3.0.0 2009-11-25 =======================
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=885216&r1=885215&r2=885216&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Sun Nov 29 11:59:38 2009
@@ -26,7 +26,6 @@
 import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 import java.io.*;
@@ -36,19 +35,27 @@
 import java.util.Collections;
 
 /**
- * {@link Analyzer} for Czech language. 
+ * {@link Analyzer} for Czech language.
  * <p>
- * Supports an external list of stopwords (words that
- * will not be indexed at all). 
- * A default set of stopwords is used unless an alternative list is specified.
+ * Supports an external list of stopwords (words that will not be indexed at
+ * all). A default set of stopwords is used unless an alternative list is
+ * specified.
  * </p>
- *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ * 
+ * <a name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CzechAnalyzer:
+ * <ul>
+ * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
+ * <li>As of 2.9, StopFilter preserves position increments
+ * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+ * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ * </ul>
  */
 public final class CzechAnalyzer extends Analyzer {
 
-	/**
+  /**
 	 * List of typical stopwords.
 	 * @deprecated use {@link #getDefaultStopSet()} instead
 	 */
@@ -74,10 +81,11 @@
         "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
     };
 	
-	/**
-	 * Returns a set of default Czech-stopwords 
-	 * @return a set of default Czech-stopwords 
-	 */
+  /**
+   * Returns a set of default Czech-stopwords
+   * 
+   * @return a set of default Czech-stopwords
+   */
 	public static final Set<?> getDefaultStopSet(){
 	  return DefaultSetHolder.DEFAULT_SET;
 	}
@@ -87,27 +95,29 @@
 	      Arrays.asList(CZECH_STOP_WORDS), false));
 	}
 
-	/**
-	 * Contains the stopwords used with the {@link StopFilter}.
-	 */
+  /**
+   * Contains the stopwords used with the {@link StopFilter}.
+   */
 	// TODO make this final in 3.1
 	private Set<?> stoptable;
   private final Version matchVersion;
 
-	/**
-	 * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
-	 */
+  /**
+   * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
+   * 
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   */
 	public CzechAnalyzer(Version matchVersion) {
     this(matchVersion, DefaultSetHolder.DEFAULT_SET);
 	}
 	
-	/**
-   * Builds an analyzer with the given stop words and stemming exclusion words
+  /**
+   * Builds an analyzer with the given stop words.
    * 
-   * @param matchVersion
-   *          lucene compatibility version
-   * @param stopwords
-   *          a stopword set
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   * @param stopwords a stopword set
    */
   public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
     this.matchVersion = matchVersion;
@@ -115,10 +125,14 @@
   }
 
 
-	/**
-	 * Builds an analyzer with the given stop words.
-	 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
-	 */
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   * @param stopwords a stopword set
+   * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
+   */
   public CzechAnalyzer(Version matchVersion, String... stopwords) {
     this(matchVersion, StopFilter.makeStopSet( stopwords ));
 	}
@@ -126,16 +140,23 @@
   /**
    * Builds an analyzer with the given stop words.
    * 
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   * @param stopwords a stopword set
    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
    */
   public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
     this(matchVersion, (Set<?>)stopwords);
 	}
 
-	/**
-	 * Builds an analyzer with the given stop words.
-	 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
-	 */
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   * @param stopwords a file containing stopwords
+   * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
+   */
   public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
     this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
 	}
@@ -171,19 +192,24 @@
         }
     }
 
-	/**
-	 * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-	 *
-	 * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-	 * 			{@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
-	 */
-	@Override
+  /**
+   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
+   *         >= LUCENE_31)
+   */
+  @Override
 	public final TokenStream tokenStream( String fieldName, Reader reader ) {
                 TokenStream result = new StandardTokenizer( matchVersion, reader );
 		result = new StandardFilter( result );
 		result = new LowerCaseFilter( matchVersion, result );
 		result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                          result, stoptable );
+		if (matchVersion.onOrAfter(Version.LUCENE_31))
+		  result = new CzechStemFilter(result);
 		return result;
 	}
 	
@@ -192,13 +218,15 @@
 	    TokenStream result;
 	};
 	
-	/**
-     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in 
-     * the provided {@link Reader}.
-     *
-     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-     *          {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
-     */
+  /**
+   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+   * text in the provided {@link Reader}.
+   * 
+   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
+   *         >= LUCENE_31)
+   */
 	@Override
 	public TokenStream reusableTokenStream(String fieldName, Reader reader)
       throws IOException {
@@ -210,6 +238,8 @@
         streams.result = new LowerCaseFilter(matchVersion, streams.result);
         streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                         streams.result, stoptable);
+        if (matchVersion.onOrAfter(Version.LUCENE_31))
+          streams.result = new CzechStemFilter(streams.result);
         setPreviousTokenStream(streams);
       } else {
         streams.source.reset(reader);

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java?rev=885216&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java Sun Nov 29 11:59:38 2009
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.cz;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
+ * 
+ * <p><b>NOTE</b>: Input is expected to be in lowercase, 
+ * but with diacritical marks</p>
+ */
+public final class CzechStemFilter extends TokenFilter {
+  private final CzechStemmer stemmer;
+  private final TermAttribute termAtt;
+  
+  public CzechStemFilter(TokenStream input) {
+    super(input);
+    stemmer = new CzechStemmer();
+    termAtt = addAttribute(TermAttribute.class);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+      termAtt.setTermLength(newlen);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java?rev=885216&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java Sun Nov 29 11:59:38 2009
@@ -0,0 +1,181 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Czech.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ */
+public class CzechStemmer {
+  
+  /**
+   * Stem an input buffer of Czech text.
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   * 
+   * <p><b>NOTE</b>: Input is expected to be in lowercase, 
+   * but with diacritical marks</p>
+   */
+  public int stem(char s[], int len) {
+    len = removeCase(s, len);
+    len = removePossessives(s, len);
+    len = normalize(s, len);
+    return len;
+  }
+  
+  private int removeCase(char s[], int len) {  
+    if (len > 7 && endsWith(s, len, "atech"))
+      return len - 5;
+    
+    if (len > 6 && 
+        (endsWith(s, len,"ětem") ||
+        endsWith(s, len,"etem") ||
+        endsWith(s, len,"atům")))
+      return len - 4;
+        
+    if (len > 5 && 
+        (endsWith(s, len, "ech") ||
+        endsWith(s, len, "ich") ||
+        endsWith(s, len, "ích") ||
+        endsWith(s, len, "ého") ||
+        endsWith(s, len, "ěmi") ||
+        endsWith(s, len, "emi") ||
+        endsWith(s, len, "ému") ||
+        endsWith(s, len, "ěte") ||
+        endsWith(s, len, "ete") ||
+        endsWith(s, len, "ěti") ||
+        endsWith(s, len, "eti") ||
+        endsWith(s, len, "ího") ||
+        endsWith(s, len, "iho") ||
+        endsWith(s, len, "ími") ||
+        endsWith(s, len, "ímu") ||
+        endsWith(s, len, "imu") ||
+        endsWith(s, len, "ách") ||
+        endsWith(s, len, "ata") ||
+        endsWith(s, len, "aty") ||
+        endsWith(s, len, "ých") ||
+        endsWith(s, len, "ama") ||
+        endsWith(s, len, "ami") ||
+        endsWith(s, len, "ové") ||
+        endsWith(s, len, "ovi") ||
+        endsWith(s, len, "ými")))
+      return len - 3;
+    
+    if (len > 4 && 
+        (endsWith(s, len, "em") ||
+        endsWith(s, len, "es") ||
+        endsWith(s, len, "ém") ||
+        endsWith(s, len, "ím") ||
+        endsWith(s, len, "ům") ||
+        endsWith(s, len, "at") ||
+        endsWith(s, len, "ám") ||
+        endsWith(s, len, "os") ||
+        endsWith(s, len, "us") ||
+        endsWith(s, len, "ým") ||
+        endsWith(s, len, "mi") ||
+        endsWith(s, len, "ou")))
+      return len - 2;
+    
+    if (len > 3) {
+      switch (s[len - 1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o':
+        case 'u':
+        case 'ů':
+        case 'y':
+        case 'á':
+        case 'é':
+        case 'í':
+        case 'ý':
+        case 'ě':
+          return len - 1;
+      }
+    }
+    
+    return len;
+  }
+  
+  private int removePossessives(char s[], int len) {
+    if (len > 5 &&
+        (endsWith(s, len, "ov") ||
+        endsWith(s, len, "in") ||
+        endsWith(s, len, "ův")))
+      return len - 2;
+
+    return len;
+  }
+  
+  private int normalize(char s[], int len) {
+    if (endsWith(s, len, "čt")) { // čt -> ck
+      s[len - 2] = 'c';
+      s[len - 1] = 'k';
+      return len;
+    }
+    
+    if (endsWith(s, len, "št")) { // št -> sk
+      s[len - 2] = 's';
+      s[len - 1] = 'k';
+      return len;
+    }
+    
+    switch(s[len - 1]) {
+      case 'c': // [cč] -> k
+      case 'č':
+        s[len - 1] = 'k';
+        return len;
+      case 'z': // [zž] -> h
+      case 'ž':
+        s[len - 1] = 'h';
+        return len;
+    }
+    
+    if (len > 1 && s[len - 2] == 'e') {
+      s[len - 2] = s[len - 1]; // e* > *
+      return len - 1;
+    }
+    
+    if (len > 2 && s[len - 2] == 'ů') {
+      s[len - 2] = 'o'; // *ů* -> *o*
+      return len;
+    }
+
+    return len;
+  }
+  
+  private boolean endsWith(char s[], int len, String suffix) {
+    int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    
+    for (int i = suffixLen - 1; i >= 0; i--)
+      if (s[len - (suffixLen - i)] != suffix.charAt(i))
+        return false;
+    
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=885216&r1=885215&r2=885216&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Sun Nov 29 11:59:38 2009
@@ -24,31 +24,50 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**
  * Test the CzechAnalyzer
  * 
- * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list.
+ * Before Lucene 3.1, CzechAnalyzer was a StandardAnalyzer with a custom 
+ * stopword list. As of 3.1 it also includes a stemmer.
  *
  */
 public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
   File dataDir = new File(System.getProperty("dataDir", "./bin"));
   File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
   
+  /**
+   * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
+   */
+  public void testStopWordLegacy() throws Exception {
+    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem", 
+        new String[] { "mluvime", "volnem" });
+  }
+  
   public void testStopWord() throws Exception {
-    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
+    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem",
+        new String[] { "mluvim", "voln" });
   }
-    
-  public void testReusableTokenStream() throws Exception {
-    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
+  
+  /**
+   * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
+   */
+  public void testReusableTokenStreamLegacy() throws Exception {
+    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
     assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
     assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
   }
+  
+  public void testReusableTokenStream() throws Exception {
+    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
+    assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
+  }
 
-  /*
+  /**
    * An input stream that always throws IOException for testing.
+   * @deprecated Remove this class when the loadStopWords method is removed.
    */
   private class UnreliableInputStream extends InputStream {
     @Override
@@ -57,24 +76,26 @@
     }
   }
   
-  /*
+  /**
    * The loadStopWords method does not throw IOException on error,
    * instead previously it set the stoptable to null (versus empty)
    * this would cause a NPE when it is time to create the StopFilter.
+   * @deprecated Remove this test when the loadStopWords method is removed.
    */
   public void testInvalidStopWordFile() throws Exception {
-    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
     cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
     assertAnalyzesTo(cz, "Pokud mluvime o volnem",
         new String[] { "pokud", "mluvime", "o", "volnem" });
   }
   
-  /* 
+  /** 
    * Test that changes to the stop table via loadStopWords are applied immediately
    * when using reusable token streams.
+   * @deprecated Remove this test when the loadStopWords method is removed.
    */
   public void testStopWordFileReuse() throws Exception {
-    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
     assertAnalyzesToReuse(cz, "Česká Republika", 
       new String[] { "česká", "republika" });
     

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java?rev=885216&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java Sun Nov 29 11:59:38 2009
@@ -0,0 +1,273 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Czech Stemmer.
+ * 
+ * Note: its algorithmic, so some stems are nonsense
+ *
+ */
+public class TestCzechStemmer extends BaseTokenStreamTestCase {
+  
+  /**
+   * Test showing how masculine noun forms conflate
+   */
+  public void testMasculineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* animate ending with a hard consonant */
+    assertAnalyzesTo(cz, "pán", new String[] { "pán" });
+    assertAnalyzesTo(cz, "páni", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pána", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pány", new String[] { "pán" });
+    assertAnalyzesTo(cz, "páne", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
+    
+    /* inanimate ending with hard consonant */
+    assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
+    
+    /* animate ending with a soft consonant */
+    assertAnalyzesTo(cz, "muž", new String[] { "muh" });
+    assertAnalyzesTo(cz, "muži", new String[] { "muh" });
+    assertAnalyzesTo(cz, "muže", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
+    
+    /* inanimate ending with a soft consonant */
+    assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
+    
+    /* ending with a */
+    assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
+    
+    /* ending with e */
+    assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
+  }
+  
+  /**
+   * Test showing how feminine noun forms conflate
+   */
+  public void testFeminineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* ending with hard consonant */
+    assertAnalyzesTo(cz, "kost", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
+    
+    /* ending with a soft consonant */
+    // note: in this example sing nom. and sing acc. don't conflate w/ the rest
+    assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
+    assertAnalyzesTo(cz, "písně", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písni", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písních", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písním", new String[] { "písn" });
+    
+    /* ending with e */
+    assertAnalyzesTo(cz, "růže", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růží", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růži", new String[] { "růh" });
+    
+    /* ending with a */
+    assertAnalyzesTo(cz, "žena", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
+    assertAnalyzesTo(cz, "žen", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
+  }
+
+  /**
+   * Test showing how neuter noun forms conflate: every form in a group below is expected to analyze to the same single token.
+   */
+  public void testNeuterNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* ending with o: forms of "město", each expected to yield "měst" */
+    assertAnalyzesTo(cz, "město", new String[] { "měst" });
+    assertAnalyzesTo(cz, "města", new String[] { "měst" });
+    assertAnalyzesTo(cz, "měst", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městu", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městům", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městě", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městech", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městem", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městy", new String[] { "měst" });
+    
+    /* ending with e: forms of "moře", each expected to yield "moř" */
+    assertAnalyzesTo(cz, "moře", new String[] { "moř" });
+    assertAnalyzesTo(cz, "moří", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
+    assertAnalyzesTo(cz, "moři", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
+
+    /* ending with ě: forms of "kuře" (including the -et-/-at- forms), each expected to yield "kuř" */
+    assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
+    
+    /* ending with í: forms of "stavení", each expected to yield "stavn" */
+    assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });    
+  }
+  
+  /**
+   * Test showing how adjectival forms conflate: every form in a group below is expected to analyze to the same single token.
+   */
+  public void testAdjectives() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* ending with ý/á/é: forms of "mladý", each expected to yield "mlad" */
+    assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladými", new String[] { "mlad" }); 
+    assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
+
+    /* ending with í: forms of "jarní", each expected to yield "jarn" */
+    assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });  
+  }
+  
+  /**
+   * Test some possessive suffixes: inputs ending in -ův and -ový are expected without them (and "Karlův" is expected in lowercase).
+   */
+  public void testPossessive() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
+    assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
+  }
+  
+  /**
+   * Test some exceptional rules, implemented as rewrites: the forms listed under each rule are expected to yield the same token.
+   */
+  public void testExceptions() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* rewrite of št -> sk */
+    assertAnalyzesTo(cz, "český", new String[] { "česk" });
+    assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
+    
+    /* rewrite of čt -> ck */
+    assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
+    assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
+    
+    /* rewrite of z -> h */
+    assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
+    assertAnalyzesTo(cz, "knize", new String[] { "knih" });
+    
+    /* rewrite of ž -> h */
+    assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
+    assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
+    
+    /* rewrite of c -> k */
+    assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
+    assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
+    assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
+    
+    /* rewrite of č -> k */
+    assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
+    assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
+    
+    /* rewrite of *ů* -> *o* */
+    assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
+    assertAnalyzesTo(cz, "hole", new String[] { "hol" });
+    
+    /* rewrite of e* -> * */
+    assertAnalyzesTo(cz, "deska", new String[] { "desk" });
+    assertAnalyzesTo(cz, "desek", new String[] { "desk" });
+  }
+  
+  /**
+   * Test that very short words are not stemmed: the one- and two-letter inputs below are expected back unchanged.
+   */
+  public void testDontStem() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(cz, "e", new String[] { "e" });
+    assertAnalyzesTo(cz, "zi", new String[] { "zi" });
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message