lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r825110 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt
Date Wed, 14 Oct 2009 12:24:19 GMT
Author: rmuir
Date: Wed Oct 14 12:24:18 2009
New Revision: 825110

URL: http://svn.apache.org/viewvc?rev=825110&view=rev
Log:
LUCENE-1966: ArabicAnalyzer stopwords cleanup

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=825110&r1=825109&r2=825110&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Oct 14 12:24:18 2009
@@ -10,6 +10,10 @@
 
 Changes in runtime behavior
 
+ * LUCENE-1966: Modified and cleaned the default Arabic stopwords list used
+   by ArabicAnalyzer. You'll need to fully re-index any previously created 
+   indexes.  (Basem Narmok via Robert Muir)
+
  * LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
    list. This has no effect on Arabic text, but if you are using a custom
    stopword list that contains some non-Arabic words, you'll need to fully

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=825110&r1=825109&r2=825110&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Wed Oct 14 12:24:18 2009
@@ -148,6 +148,7 @@
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
     result = new LowerCaseFilter(result);
+    // the order here is important: the stopword list is not normalized!
     result = new StopFilter( result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
@@ -175,6 +176,7 @@
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
       streams.result = new LowerCaseFilter(streams.source);
+      // the order here is important: the stopword list is not normalized!
       streams.result = new StopFilter(streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);

Modified: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt?rev=825110&r1=825109&r2=825110&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt Wed Oct 14 12:24:18 2009
@@ -1,161 +1,123 @@
 # This file was created by Jacques Savoy and is distributed under the BSD license.
 # See http://members.unine.ch/jacques.savoy/clef/index.html.
 # Also see http://www.opensource.org/licenses/bsd-license.html
+# Cleaned on October 11, 2009 (not normalized, so use before normalization)
+من
+ومن
+منها
+منه
+في
+وفي
+فيها
+فيه
+و
+ف
+ثم
+او
+أو
 ب
+بها
+به
 ا
 أ
-،
-عبد
-عدم
-عام
-عاما
+اى
+اي
+أي
+أى
+لا
+ولا
+الا
+ألا
+إلا
+لكن
+ما
+وما
+كما
+فما
 عن
-عند
-عندما
-على
-عليه
-عليها
-تم
-ضد
-بعد
-بعض
-اعادة
-اعلن
-اعلنت
-بسبب
-حتى
-اتفاق
+مع
 اذا
-احد
-اثر
-اجتماع
-اطار
-اربعة
-اخرى
-بان
-ابو
-اجل
-غير
-اطلاق
-بشكل
-حاليا
-بن
-به
-ثم
-اف
+إذا
 ان
-او
-اي
-بها
-جهة
-حيث
-اكد
-الا
-اما
-العام
-السابق
-السلام
-التعاون
+أن
+إن
+انها
+أنها
+إنها
+انه
+أنه
+إنه
+بان
+بأن
+فان
+فأن
+وان
+وأن
+وإن
 التى
 التي
-اكثر
-ايضا
-الذاتي
-الاخيرة
-الثاني
-الثانية
 الذى
 الذي
-الان
-خلال
-حوالى
 الذين
-الحكم
-الاول
-الاولى
-بين
-ذلك
 الى
-انه
-ضمن
-انها
-جميع
-الماضي
-المقبل
-ف
-و
-و6
-قد
-لا
-ما
-مع
-هذا
-واضاف
-واضافت
-فان
-قبل
-كان
-لدى
-نحو
-هذه
-وان
-يذكر
-كانت
-واوضح
-فى
-في
+الي
+إلى
+إلي
+على
+عليها
+عليه
+اما
+أما
+إما
+ايضا
+أيضا
 كل
+وكل
 لم
+ولم
 لن
-له
-من
-هو
+ولن
+هى
 هي
-كما
+هو
+وهى
+وهي
+وهو
+فهى
+فهي
+فهو
+انت
+أنت
+لك
 لها
-منذ
-ولا
-مقابل
+له
+هذه
+هذا
+تلك
+ذلك
 هناك
-وكان
-وكانت
-فيه
-لكن
-وفي
-ولم
-ومن
-وهو
-وهي
-فيها
-منها
+كانت
+كان
 يكون
-أخرى
-إذا
-أربعة
-إطار
-إعادة
-أعلن
-أعلنت
-أف
-أكثر
-أكد
-إلا
-الأخيرة
+تكون
+وكانت
+وكان
+غير
+بعض
+قد
+نحو
+بين
+بينما
+منذ
+ضمن
+حيث
+الان
 الآن
-الأول
-الأولى
-إلى
-أما
-أن
-إن
-إنه
-أنه
-أنها
-إنها
-أو
-اى
-أي
-أى
-أيضا
-بأن
-فإن
+خلال
+بعد
+قبل
+حتى
+عند
+عندما
+لدى
+جميع



Mime
View raw message