lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sim...@apache.org
Subject svn commit: r888787 - in /lucene/java/trunk/contrib: ./ snowball/ snowball/src/java/org/apache/lucene/analysis/snowball/ snowball/src/test/org/apache/lucene/analysis/snowball/
Date Wed, 09 Dec 2009 12:47:37 GMT
Author: simonw
Date: Wed Dec  9 12:47:37 2009
New Revision: 888787

URL: http://svn.apache.org/viewvc?rev=888787&view=rev
Log:
LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of LowerCaseFilter to correctly
handle the unique Turkish casing behavior if used with Version > 3.0 and the TurkishStemmer.

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/snowball/build.xml
    lucene/java/trunk/contrib/snowball/pom.xml.template
    lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
    lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
    lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Dec  9 12:47:37 2009
@@ -2,6 +2,13 @@
 
 ======================= Trunk (not yet released) =======================
 
+Changes in runtime behavior
+
+* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
+  LowerCaseFilter to correctly handle the unique Turkish casing behavior if
+  used with Version > 3.0 and the TurkishStemmer.
+  (Robert Muir via Simon Willnauer)  
+
 Bug fixes
 
  * LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
@@ -39,6 +46,10 @@
 
 Build
 
+ * LUCENE-2117: SnowballAnalyzer now holds a runtime-dependency on
+   contrib-analyzers to correctly handle the unique Turkish casing behavior.
+   (Robert Muir via Simon Willnauer)  
+
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 
    into core, and moved the ICU-based collation support into contrib/icu.  
    (Robert Muir)

Modified: lucene/java/trunk/contrib/snowball/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/build.xml?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/build.xml (original)
+++ lucene/java/trunk/contrib/snowball/build.xml Wed Dec  9 12:47:37 2009
@@ -30,6 +30,15 @@
   <property name="snowball.root" value="snowball/website"/>
   <property name="bin.dir" location="bin"/>
 
+  <property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"/>
+  <available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
+  
+  <path id="classpath">
+	<pathelement path="${lucene.jar}"/>
+	<pathelement path="${analyzers.jar}"/>
+	<pathelement path="${project.classpath}"/>
+  </path>
+
   <target name="jar" depends="compile" description="Create JAR">
     <jarify>
       <metainf-includes>
@@ -121,5 +130,11 @@
 
   </target>
 
+  <target name="compile-core" depends="build-analyzers, common.compile-core" />
+  
+  <target name="build-analyzers" unless="analyzers.jar.present">
+    <echo>Snowball building dependency ${analyzers.jar}</echo>
+    <ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
+  </target>
 
 </project>

Modified: lucene/java/trunk/contrib/snowball/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/pom.xml.template?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/pom.xml.template (original)
+++ lucene/java/trunk/contrib/snowball/pom.xml.template Wed Dec  9 12:47:37 2009
@@ -33,4 +33,11 @@
   <version>@version@</version>
   <description>Snowball Analyzers</description>
   <packaging>jar</packaging>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers</artifactId>
+      <version>@version@</version>
+    </dependency>
+  </dependencies>
 </project>

Modified: lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (original)
+++ lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java Wed Dec  9 12:47:37 2009
@@ -19,6 +19,7 @@
 
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.standard.*;
+import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
@@ -33,7 +34,11 @@
  * {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
  *
  * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ * dependent settings as {@link StandardAnalyzer}, with the following addition:
+ * <ul>
+ *   <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
+ * </ul>
+ * </p>
  */
 public class SnowballAnalyzer extends Analyzer {
   private String name;
@@ -60,7 +65,11 @@
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
-    result = new LowerCaseFilter(matchVersion, result);
+    // Use a special lowercase filter for turkish, the stemmer expects it.
+    if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
+      result = new TurkishLowerCaseFilter(result);
+    else
+      result = new LowerCaseFilter(matchVersion, result);
     if (stopSet != null)
       result = new StopFilter(matchVersion,
                               result, stopSet);
@@ -91,7 +100,11 @@
       streams = new SavedStreams();
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
+      // Use a special lowercase filter for turkish, the stemmer expects it.
+      if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
+        streams.result = new TurkishLowerCaseFilter(streams.result);
+      else
+        streams.result = new LowerCaseFilter(matchVersion, streams.result);
       if (stopSet != null)
         streams.result = new StopFilter(matchVersion,
                                         streams.result, stopSet);

Modified: lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (original)
+++ lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java Wed Dec  9 12:47:37 2009
@@ -22,12 +22,20 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
+import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
 import org.tartarus.snowball.SnowballProgram;
 
 /**
  * A filter that stems words using a Snowball-generated stemmer.
  *
  * Available stemmers are listed in {@link org.tartarus.snowball.ext}.
+ * <p><b>NOTE</b>: SnowballFilter expects lowercased text.
+ * <ul>
+ *  <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
+ *  <li>For other languages, see {@link LowerCaseFilter}.
+ * </ul>
+ * </p>
  */
 public final class SnowballFilter extends TokenFilter {
 

Modified: lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java?rev=888787&r1=888786&r2=888787&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (original)
+++ lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java Wed Dec  9 12:47:37 2009
@@ -18,7 +18,6 @@
  */
 
 import java.io.Reader;
-import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
@@ -41,6 +40,44 @@
         new String[]{"he", "abhor", "accent"});
   }
 
+  /**
+   * Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
+   * we lowercase I correctly for non-Turkish languages in either case.
+   */
+  public void testEnglishLowerCase() throws Exception {
+    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
+    assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
+    assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
+    
+    Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
+    assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
+    assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
+  }
+  
+  /**
+   * Test turkish lowercasing
+   */
+  public void testTurkish() throws Exception {
+    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
+
+    assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
+    assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
+  }
+  
+  /**
+   * Test turkish lowercasing (old buggy behavior)
+   * @deprecated Remove this when support for 3.0 indexes is no longer required
+   */
+  public void testTurkishBWComp() throws Exception {
+    Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
+    // AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
+    // this fails due to wrong casing, because the stemmer
+    // will only remove -ı, not -i
+    assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
+    assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
+  }
+
+  
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
     assertAnalyzesToReuse(a, "he abhorred accents",



Mime
View raw message