lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r802955 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/fa/ analyzers/common/src/resources/org/apache/lucene/analysis/fa/ analyzers/common/src/test/org/apache/lucene/analysis/fa/
Date Mon, 10 Aug 2009 23:29:27 GMT
Author: rmuir
Date: Mon Aug 10 23:29:27 2009
New Revision: 802955

URL: http://svn.apache.org/viewvc?rev=802955&view=rev
Log:
LUCENE-1628: Add Persian Analyzer

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
  (with props)
Modified:
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=802955&r1=802954&r2=802955&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Mon Aug 10 23:29:27 2009
@@ -142,6 +142,7 @@
 
 15. LUCENE-1406: Added Arabic analyzer.  (Robert Muir via Grant Ingersoll)
 
+16. LUCENE-1628: Added Persian analyzer.  (Robert Muir)
 
 Optimizations
 

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+
+/**
+ * Analyzer for Persian.
+ * 
+ * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+ * ZWNJ in addition to space. Some persian-specific variant forms (such as farsi
+ * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
+ * 
+ */
+public final class PersianAnalyzer extends Analyzer {
+
+  /**
+   * File containing default Persian stopwords.
+   * 
+   * Default stopword list is from
+   * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
+   * BSD-Licensed.
+   * 
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Contains the stopwords used with the StopFilter.
+   */
+  private Set stoptable = new HashSet();
+
+  /**
+   * The comment character in the stopwords file. All lines prefixed with this
+   * will be ignored
+   */
+  public static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   * 
+   * @throws RuntimeException wrapping the IOException if the bundled stopword
+   *         resource cannot be read
+   */
+  public PersianAnalyzer() {
+    try {
+      InputStream stream = PersianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      // A missing resource yields a null stream; the reader constructor then
+      // fails fast with a NullPointerException, which callers already rely on.
+      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+      try {
+        stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
+      } finally {
+        // Closing the reader also closes the wrapped stream, and this runs
+        // even when loading the word list fails.
+        reader.close();
+      }
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param stopwords the stop words to filter out
+   */
+  public PersianAnalyzer(String[] stopwords) {
+    stoptable = StopFilter.makeStopSet(stopwords);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param stopwords table whose keys are the stop words to filter out
+   */
+  public PersianAnalyzer(Hashtable stopwords) {
+    stoptable = new HashSet(stopwords.keySet());
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. Lines can be commented out
+   * using {@link #STOPWORDS_COMMENT}
+   * 
+   * @param stopwords file to load the stop words from
+   * @throws IOException if the file cannot be read
+   */
+  public PersianAnalyzer(File stopwords) throws IOException {
+    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
+  }
+
+  /**
+   * Wraps the given stream in the analyzer's filter chain. Shared by
+   * {@link #tokenStream} and {@link #reusableTokenStream} so that both always
+   * apply the same filters in the same order.
+   */
+  private TokenStream applyFilters(TokenStream tokens) {
+    TokenStream result = new LowerCaseFilter(tokens);
+    result = new ArabicNormalizationFilter(result);
+    /* additional persian-specific normalization */
+    result = new PersianNormalizationFilter(result);
+    /*
+     * the order here is important: the stopword list is normalized with the
+     * above!
+     */
+    return new StopFilter(result, stoptable);
+  }
+
+  /**
+   * Creates a TokenStream which tokenizes all the text in the provided Reader.
+   * 
+   * @param fieldName name of the field being analyzed (not used to select the
+   *        chain)
+   * @param reader source of the text to tokenize
+   * @return A TokenStream built from an ArabicLetterTokenizer filtered with
+   *         LowerCaseFilter, ArabicNormalizationFilter,
+   *         PersianNormalizationFilter and Persian Stop words
+   */
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return applyFilters(new ArabicLetterTokenizer(reader));
+  }
+
+  /**
+   * Holder for the tokenizer and the end of its filter chain, saved between
+   * calls to {@link #reusableTokenStream}. Static because it needs no
+   * reference to the enclosing analyzer.
+   */
+  private static class SavedStreams {
+    Tokenizer source;
+    TokenStream result;
+  }
+
+  /**
+   * Returns a (possibly reused) TokenStream which tokenizes all the text 
+   * in the provided Reader.
+   * 
+   * @param fieldName name of the field being analyzed (not used to select the
+   *        chain)
+   * @param reader source of the text to tokenize
+   * @return A TokenStream built from an ArabicLetterTokenizer filtered with
+   *         LowerCaseFilter, ArabicNormalizationFilter,
+   *         PersianNormalizationFilter and Persian Stop words
+   * @throws IOException if resetting the saved tokenizer fails
+   */
+  public TokenStream reusableTokenStream(String fieldName, Reader reader)
+      throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      // First use: build the chain once and remember it.
+      streams = new SavedStreams();
+      streams.source = new ArabicLetterTokenizer(reader);
+      streams.result = applyFilters(streams.source);
+      setPreviousTokenStream(streams);
+    } else {
+      // Later uses: point the saved tokenizer at the new input.
+      streams.source.reset(reader);
+    }
+    return streams.result;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A TokenFilter that applies {@link PersianNormalizer} to normalize the
+ * orthography.
+ * 
+ */
+
+public final class PersianNormalizationFilter extends TokenFilter {
+
+  /** Performs the in-place orthographic normalization of each term. */
+  private final PersianNormalizer normalizer = new PersianNormalizer();
+
+  /** Term attribute of the wrapped stream; its buffer is rewritten in place. */
+  private TermAttribute termAtt;
+
+  /**
+   * Creates a filter that normalizes the terms produced by the given stream.
+   * 
+   * @param input the stream whose terms are normalized
+   */
+  public PersianNormalizationFilter(TokenStream input) {
+    super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /**
+   * Advances the wrapped stream and normalizes the current term.
+   * 
+   * @return true if a token was produced, false once the input is exhausted
+   * @throws IOException if the wrapped stream fails
+   */
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    // The normalizer edits the buffer in place and reports the new length.
+    char[] buffer = termAtt.termBuffer();
+    int normalizedLength = normalizer.normalize(buffer, termAtt.termLength());
+    termAtt.setTermLength(normalizedLength);
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalizer for Persian.
+ * <p>
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ * <p>
+ * Normalization is defined as:
+ * <ul>
+ * <li>Normalization of various heh + hamza forms and heh goal to heh.
+ * <li>Normalization of farsi yeh and yeh barree to arabic yeh
+ * <li>Normalization of persian keheh to arabic kaf
+ * </ul>
+ * 
+ */
+public class PersianNormalizer {
+  public static final char YEH = '\u064A';
+
+  public static final char FARSI_YEH = '\u06CC';
+
+  public static final char YEH_BARREE = '\u06D2';
+
+  public static final char KEHEH = '\u06A9';
+
+  public static final char KAF = '\u0643';
+
+  public static final char HAMZA_ABOVE = '\u0654';
+
+  public static final char HEH_YEH = '\u06C0';
+
+  public static final char HEH_GOAL = '\u06C1';
+
+  public static final char HEH = '\u0647';
+
+  /**
+   * Normalize an input buffer of Persian text
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int normalize(char s[], int len) {
+
+    for (int i = 0; i < len; i++) {
+      if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
+        s[i] = YEH;
+
+      if (s[i] == KEHEH)
+        s[i] = KAF;
+
+      if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
+        s[i] = HEH;
+
+      if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
+        len = delete(s, i, len);
+        i--;
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Delete a character in-place
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion
+   */
+  protected int delete(char s[], int pos, int len) {
+    if (pos < len)
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+    return len - 1;
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,5 @@
+<html><head></head>
+<body>
+Analyzer for Persian.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,311 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+انان
+نداشته
+سراسر
+خياه
+ايشان
+وي
+تاكنون
+بيشتري
+دوم
+پس
+ناشي
+وگو
+يا
+داشتند
+سپس
+هنگام
+هرگز
+پنج
+نشان
+امسال
+ديگر
+گروهي
+شدند
+چطور
+ده
+و
+دو
+نخستين
+ولي
+چرا
+چه
+وسط
+ه
+كدام
+قابل
+يك
+رفت
+هفت
+همچنين
+در
+هزار
+بله
+بلي
+شايد
+اما
+شناسي
+گرفته
+دهد
+داشته
+دانست
+داشتن
+خواهيم
+ميليارد
+وقتيكه
+امد
+خواهد
+جز
+اورده
+شده
+بلكه
+خدمات
+شدن
+برخي
+نبود
+بسياري
+جلوگيري
+حق
+كردند
+نوعي
+بعري
+نكرده
+نظير
+نبايد
+بوده
+بودن
+داد
+اورد
+هست
+جايي
+شود
+دنبال
+داده
+بايد
+سابق
+هيچ
+همان
+انجا
+كمتر
+كجاست
+گردد
+كسي
+تر
+مردم
+تان
+دادن
+بودند
+سري
+جدا
+ندارند
+مگر
+يكديگر
+دارد
+دهند
+بنابراين
+هنگامي
+سمت
+جا
+انچه
+خود
+دادند
+زياد
+دارند
+اثر
+بدون
+بهترين
+بيشتر
+البته
+به
+براساس
+بيرون
+كرد
+بعضي
+گرفت
+توي
+اي
+ميليون
+او
+جريان
+تول
+بر
+مانند
+برابر
+باشيم
+مدتي
+گويند
+اكنون
+تا
+تنها
+جديد
+چند
+بي
+نشده
+كردن
+كردم
+گويد
+كرده
+كنيم
+نمي
+نزد
+روي
+قصد
+فقط
+بالاي
+ديگران
+اين
+ديروز
+توسط
+سوم
+ايم
+دانند
+سوي
+استفاده
+شما
+كنار
+داريم
+ساخته
+طور
+امده
+رفته
+نخست
+بيست
+نزديك
+طي
+كنيد
+از
+انها
+تمامي
+داشت
+يكي
+طريق
+اش
+چيست
+روب
+نمايد
+گفت
+چندين
+چيزي
+تواند
+ام
+ايا
+با
+ان
+ايد
+ترين
+اينكه
+ديگري
+راه
+هايي
+بروز
+همچنان
+پاعين
+كس
+حدود
+مختلف
+مقابل
+چيز
+گيرد
+ندارد
+ضد
+همچون
+سازي
+شان
+مورد
+باره
+مرسي
+خويش
+برخوردار
+چون
+خارج
+شش
+هنوز
+تحت
+ضمن
+هستيم
+گفته
+فكر
+بسيار
+پيش
+براي
+روزهاي
+انكه
+نخواهد
+بالا
+كل
+وقتي
+كي
+چنين
+كه
+گيري
+نيست
+است
+كجا
+كند
+نيز
+يابد
+بندي
+حتي
+توانند
+عقب
+خواست
+كنند
+بين
+تمام
+همه
+ما
+باشند
+مثل
+شد
+اري
+باشد
+اره
+طبق
+بعد
+اگر
+صورت
+غير
+جاي
+بيش
+ريزي
+اند
+زيرا
+چگونه
+بار
+لطفا
+مي
+درباره
+من
+ديده
+همين
+گذاري
+برداري
+علت
+گذاشته
+هم
+فوق
+نه
+ها
+شوند
+اباد
+همواره
+هر
+اول
+خواهند
+چهار
+نام
+امروز
+مان
+هاي
+قبل
+كنم
+سعي
+تازه
+را
+هستند
+زير
+جلوي
+عنوان
+بود

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,248 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Test the Persian Analyzer
+ * 
+ */
+public class TestPersianAnalyzer extends TestCase {
+
+  /**
+   * This test fails with NPE when the stopwords file is missing in classpath
+   */
+  public void testResourcesAvailable() {
+    new PersianAnalyzer();
+  }
+
+  /**
+   * This test shows how the combination of tokenization (breaking on zero-width
+   * non-joiner), normalization (such as treating arabic YEH and farsi YEH the
+   * same), and stopwords creates a light-stemming effect for verbs.
+   * 
+   * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
+   */
+  public void testBehaviorVerbs() throws Exception {
+    Analyzer a = new PersianAnalyzer();
+    // active present indicative
+    assertAnalyzesTo(a, "می‌خورد", new String[] { "خورد"
});
+    // active preterite indicative
+    assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
+    // active imperfective preterite indicative
+    assertAnalyzesTo(a, "می‌خورد", new String[] { "خورد"
});
+    // active future indicative
+    assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد"
});
+    // active present progressive indicative
+    assertAnalyzesTo(a, "دارد می‌خورد", new String[] {
"خورد" });
+    // active preterite progressive indicative
+    assertAnalyzesTo(a, "داشت می‌خورد", new String[] {
"خورد" });
+
+    // active perfect indicative
+    assertAnalyzesTo(a, "خورده‌است", new String[] { "خورده"
});
+    // active imperfective perfect indicative
+    assertAnalyzesTo(a, "می‌خورده‌است", new String[]
{ "خورده" });
+    // active pluperfect indicative
+    assertAnalyzesTo(a, "خورده بود", new String[] { "خورده"
});
+    // active imperfective pluperfect indicative
+    assertAnalyzesTo(a, "می‌خورده بود", new String[] {
"خورده" });
+    // active preterite subjunctive
+    assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده"
});
+    // active imperfective preterite subjunctive
+    assertAnalyzesTo(a, "می‌خورده باشد", new String[]
{ "خورده" });
+    // active pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده بوده باشد", new String[]
{ "خورده" });
+    // active imperfective pluperfect subjunctive
+    assertAnalyzesTo(a, "می‌خورده بوده باشد",
new String[] { "خورده" });
+    // passive present indicative
+    assertAnalyzesTo(a, "خورده می‌شود", new String[] {
"خورده" });
+    // passive preterite indicative
+    assertAnalyzesTo(a, "خورده شد", new String[] { "خورده"
});
+    // passive imperfective preterite indicative
+    assertAnalyzesTo(a, "خورده می‌شد", new String[] { "خورده"
});
+    // passive perfect indicative
+    assertAnalyzesTo(a, "خورده شده‌است", new String[]
{ "خورده" });
+    // passive imperfective perfect indicative
+    assertAnalyzesTo(a, "خورده می‌شده‌است",
new String[] { "خورده" });
+    // passive pluperfect indicative
+    assertAnalyzesTo(a, "خورده شده بود", new String[] {
"خورده" });
+    // passive imperfective pluperfect indicative
+    assertAnalyzesTo(a, "خورده می‌شده بود", new
String[] { "خورده" });
+    // passive future indicative
+    assertAnalyzesTo(a, "خورده خواهد شد", new String[]
{ "خورده" });
+    // passive present progressive indicative
+    assertAnalyzesTo(a, "دارد خورده می‌شود",
new String[] { "خورده" });
+    // passive preterite progressive indicative
+    assertAnalyzesTo(a, "داشت خورده می‌شد", new
String[] { "خورده" });
+    // passive present subjunctive
+    assertAnalyzesTo(a, "خورده شود", new String[] { "خورده"
});
+    // passive preterite subjunctive
+    assertAnalyzesTo(a, "خورده شده باشد", new String[]
{ "خورده" });
+    // passive imperfective preterite subjunctive
+    assertAnalyzesTo(a, "خورده می‌شده باشد",
new String[] { "خورده" });
+    // passive pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده شده بوده باشد",
new String[] { "خورده" });
+    // passive imperfective pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده می‌شده بوده
باشد", new String[] { "خورده" });
+
+    // active present subjunctive
+    assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
+  }
+
+  /**
+   * This test shows how the combination of tokenization and stopwords creates a
+   * light-stemming effect for verbs.
+   * 
+   * In this case, these forms are presented with alternative orthography, using
+   * arabic yeh and whitespace. This yeh phenomenon is common for legacy text
+   * due to some previous bugs in Microsoft Windows.
+   * 
+   * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
+   */
+  public void testBehaviorVerbsDefective() throws Exception {
+    Analyzer a = new PersianAnalyzer();
+    // active present indicative
+    assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
+    // active preterite indicative
+    assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
+    // active imperfective preterite indicative
+    assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
+    // active future indicative
+    assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد"
});
+    // active present progressive indicative
+    assertAnalyzesTo(a, "دارد مي خورد", new String[] { "خورد"
});
+    // active preterite progressive indicative
+    assertAnalyzesTo(a, "داشت مي خورد", new String[] { "خورد"
});
+
+    // active perfect indicative
+    assertAnalyzesTo(a, "خورده است", new String[] { "خورده"
});
+    // active imperfective perfect indicative
+    assertAnalyzesTo(a, "مي خورده است", new String[] { "خورده"
});
+    // active pluperfect indicative
+    assertAnalyzesTo(a, "خورده بود", new String[] { "خورده"
});
+    // active imperfective pluperfect indicative
+    assertAnalyzesTo(a, "مي خورده بود", new String[] { "خورده"
});
+    // active preterite subjunctive
+    assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده"
});
+    // active imperfective preterite subjunctive
+    assertAnalyzesTo(a, "مي خورده باشد", new String[] {
"خورده" });
+    // active pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده بوده باشد", new String[]
{ "خورده" });
+    // active imperfective pluperfect subjunctive
+    assertAnalyzesTo(a, "مي خورده بوده باشد",
new String[] { "خورده" });
+    // passive present indicative
+    assertAnalyzesTo(a, "خورده مي شود", new String[] { "خورده"
});
+    // passive preterite indicative
+    assertAnalyzesTo(a, "خورده شد", new String[] { "خورده"
});
+    // passive imperfective preterite indicative
+    assertAnalyzesTo(a, "خورده مي شد", new String[] { "خورده"
});
+    // passive perfect indicative
+    assertAnalyzesTo(a, "خورده شده است", new String[] {
"خورده" });
+    // passive imperfective perfect indicative
+    assertAnalyzesTo(a, "خورده مي شده است", new String[]
{ "خورده" });
+    // passive pluperfect indicative
+    assertAnalyzesTo(a, "خورده شده بود", new String[] {
"خورده" });
+    // passive imperfective pluperfect indicative
+    assertAnalyzesTo(a, "خورده مي شده بود", new String[]
{ "خورده" });
+    // passive future indicative
+    assertAnalyzesTo(a, "خورده خواهد شد", new String[]
{ "خورده" });
+    // passive present progressive indicative
+    assertAnalyzesTo(a, "دارد خورده مي شود", new
String[] { "خورده" });
+    // passive preterite progressive indicative
+    assertAnalyzesTo(a, "داشت خورده مي شد", new String[]
{ "خورده" });
+    // passive present subjunctive
+    assertAnalyzesTo(a, "خورده شود", new String[] { "خورده"
});
+    // passive preterite subjunctive
+    assertAnalyzesTo(a, "خورده شده باشد", new String[]
{ "خورده" });
+    // passive imperfective preterite subjunctive
+    assertAnalyzesTo(a, "خورده مي شده باشد", new
String[] { "خورده" });
+    // passive pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده شده بوده باشد",
new String[] { "خورده" });
+    // passive imperfective pluperfect subjunctive
+    assertAnalyzesTo(a, "خورده مي شده بوده باشد",
new String[] { "خورده" });
+
+    // active present subjunctive
+    assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
+  }
+
+  /**
+   * This test shows how the combination of tokenization (breaking on zero-width
+   * non-joiner or space) and stopwords creates a light-stemming effect for
+   * nouns, removing the plural -ha.
+   */
+  public void testBehaviorNouns() throws Exception {
+    Analyzer a = new PersianAnalyzer();
+    assertAnalyzesTo(a, "برگ ها", new String[] { "برگ" });
+    assertAnalyzesTo(a, "برگ‌ها", new String[] { "برگ" });
+  }
+
+  /**
+   * Test showing that non-persian text is treated very much like SimpleAnalyzer
+   * (lowercased, etc)
+   */
+  public void testBehaviorNonPersian() throws Exception {
+    Analyzer a = new PersianAnalyzer();
+    assertAnalyzesTo(a, "English test.", new String[] { "english", "test" });
+  }
+  
+  /**
+   * Basic test ensuring that reusableTokenStream works correctly.
+   */
+  public void testReusableTokenStream() throws Exception {
+    Analyzer a = new PersianAnalyzer();
+    assertAnalyzesToReuse(a, "خورده مي شده بوده
باشد", new String[] { "خورده" });
+    assertAnalyzesToReuse(a, "برگ‌ها", new String[] { "برگ"
});
+  }
+
+  /**
+   * Asserts that analyzing <code>input</code> with a fresh token stream yields
+   * exactly the terms in <code>output</code>, in order, and nothing more.
+   * 
+   * @param a analyzer under test
+   * @param input text to analyze
+   * @param output expected terms
+   */
+  private void assertAnalyzesTo(Analyzer a, String input, String[] output)
+      throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    try {
+      TermAttribute termAtt = (TermAttribute) ts
+          .getAttribute(TermAttribute.class);
+
+      for (int i = 0; i < output.length; i++) {
+        assertTrue(ts.incrementToken());
+        assertEquals(output[i], termAtt.term());
+      }
+
+      assertFalse(ts.incrementToken());
+    } finally {
+      // Close the stream even when one of the assertions above fails.
+      ts.close();
+    }
+  }
+  
+  /**
+   * Asserts that analyzing <code>input</code> through the analyzer's reusable
+   * token stream yields exactly the terms in <code>output</code>, in order,
+   * and nothing more.
+   * 
+   * @param a analyzer under test
+   * @param input text to analyze
+   * @param output expected terms
+   */
+  private void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
+      throws Exception {
+    TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+    try {
+      TermAttribute termAtt = (TermAttribute) ts
+          .getAttribute(TermAttribute.class);
+
+      for (int i = 0; i < output.length; i++) {
+        assertTrue(ts.incrementToken());
+        assertEquals(output[i], termAtt.term());
+      }
+
+      assertFalse(ts.incrementToken());
+    } finally {
+      // Close the stream even when one of the assertions above fails.
+      ts.close();
+    }
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java?rev=802955&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
Mon Aug 10 23:29:27 2009
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Test the Persian Normalization Filter
+ * 
+ */
+public class TestPersianNormalizationFilter extends TestCase {
+
+  public void testFarsiYeh() throws IOException {
+    check("های", "هاي");
+  }
+
+  public void testYehBarree() throws IOException {
+    check("هاے", "هاي");
+  }
+
+  public void testKeheh() throws IOException {
+    check("کشاندن", "كشاندن");
+  }
+
+  public void testHehYeh() throws IOException {
+    check("كتابۀ", "كتابه");
+  }
+
+  public void testHehHamzaAbove() throws IOException {
+    check("كتابهٔ", "كتابه");
+  }
+
+  public void testHehGoal() throws IOException {
+    check("زادہ", "زاده");
+  }
+
+  /**
+   * Runs <code>input</code> through an ArabicLetterTokenizer followed by a
+   * PersianNormalizationFilter and asserts that exactly one token comes out,
+   * equal to <code>expected</code>.
+   * 
+   * @param input text to tokenize and normalize
+   * @param expected the single normalized term that must be produced
+   */
+  private void check(final String input, final String expected)
+      throws IOException {
+    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(
+        new StringReader(input));
+    PersianNormalizationFilter filter = new PersianNormalizationFilter(
+        tokenStream);
+    try {
+      TermAttribute termAtt = (TermAttribute) filter
+          .getAttribute(TermAttribute.class);
+      assertTrue(filter.incrementToken());
+      assertEquals(expected, termAtt.term());
+      assertFalse(filter.incrementToken());
+    } finally {
+      // Close the filter even when one of the assertions above fails.
+      filter.close();
+    }
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message