lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r823534 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
Date Fri, 09 Oct 2009 12:55:48 GMT
Author: rmuir
Date: Fri Oct  9 12:55:47 2009
New Revision: 823534

URL: http://svn.apache.org/viewvc?rev=823534&view=rev
Log:
LUCENE-1963: Lowercase before stopfilter in ArabicAnalyzer

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=823534&r1=823533&r2=823534&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Fri Oct  9 12:55:47 2009
@@ -10,6 +10,11 @@
 
 Changes in runtime behavior
 
+ * LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
+   list. This has no effect on Arabic text, but if you are using a custom
+   stopword list that contains some non-Arabic words, you'll need to fully
+   reindex.  (DM Smith via Robert Muir)
+
 API Changes
 
  * LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=823534&r1=823533&r2=823534&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Oct  9 12:55:47 2009
@@ -142,13 +142,13 @@
    * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   * 			{@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   * 			{@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new StopFilter( result, stoptable );
     result = new LowerCaseFilter(result);
+    result = new StopFilter( result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
 
@@ -165,7 +165,7 @@
    * in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
@@ -174,8 +174,8 @@
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new StopFilter(streams.source, stoptable);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new StopFilter(streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);
       setPreviousTokenStream(streams);

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=823534&r1=823533&r2=823534&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Oct  9 12:55:47 2009
@@ -72,4 +72,13 @@
     assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
         "english", "text" });
   }
+  
+  /**
+   * Test that custom stopwords work, and are not case-sensitive.
+   */
+  public void testCustomStopwords() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
+    assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
+        "brown", "fox" });
+  }
 }



Mime
View raw message