lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r825331 - in /lucene/java/branches/lucene_2_9/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
Date Wed, 14 Oct 2009 23:00:53 GMT
Author: mikemccand
Date: Wed Oct 14 23:00:53 2009
New Revision: 825331

URL: http://svn.apache.org/viewvc?rev=825331&view=rev
Log:
LUCENE-1963 (on 2.9 branch): lowercase before stopwords in ArabicAnalyzer

Modified:
    lucene/java/branches/lucene_2_9/contrib/CHANGES.txt
    lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java

Modified: lucene/java/branches/lucene_2_9/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/CHANGES.txt?rev=825331&r1=825330&r2=825331&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/contrib/CHANGES.txt Wed Oct 14 23:00:53 2009
@@ -3,7 +3,14 @@
 ======================= 2.9 Branch (not yet released) =======================
 
 Changes in backwards compatibility policy
-   
+
+Changes in runtime behavior
+
+ * LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
+   list. This has no effect on Arabic text, but if you are using a custom
+   stopword list that contains some non-Arabic words, you'll need to fully
+   reindex.  (DM Smith via Robert Muir)
+      
 Bug fixes
 
  * LUCENE-1953: FastVectorHighlighter: small fragCharSize can cause

Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=825331&r1=825330&r2=825331&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Wed Oct 14 23:00:53 2009
@@ -111,13 +111,13 @@
    * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   * 			{@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   * 			{@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new StopFilter( result, stoptable );
     result = new LowerCaseFilter(result);
+    result = new StopFilter( result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
 
@@ -134,7 +134,7 @@
    * in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
@@ -143,8 +143,8 @@
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new StopFilter(streams.source, stoptable);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new StopFilter(streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);
       setPreviousTokenStream(streams);

Modified: lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=825331&r1=825330&r2=825331&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Wed Oct 14 23:00:53 2009
@@ -72,4 +72,13 @@
     assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
         "english", "text" });
   }
+  
+  /**
+   * Test that custom stopwords work, and are not case-sensitive.
+   */
+  public void testCustomStopwords() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
+    assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
+        "brown", "fox" });
+  }
 }



Mime
View raw message