lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r706342 - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/ src/java/org/apache/lucene/analysis/ src/test/org/apache/lucene/index/
Date Mon, 20 Oct 2008 17:19:30 GMT
Author: gsingers
Date: Mon Oct 20 10:19:29 2008
New Revision: 706342

URL: http://svn.apache.org/viewvc?rev=706342&view=rev
Log:
LUCENE-1406.  Added Arabic stemming and normalization.  Also added new method to WordlistLoader
to allow for comments in word lists.

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
  (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
  (with props)
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
  (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/NOTICE.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestWordlistLoader.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=706342&r1=706341&r2=706342&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Oct 20 10:19:29 2008
@@ -34,6 +34,8 @@
     static methods.  (Shalin Shekhar Mangar via Mike McCandless)
 
 
+ 3. LUCENE-1406: Added Arabic analyzer.  (Robert Muir via Grant Ingersoll)
+
 Optimizations
 
 Documentation

Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=706342&r1=706341&r2=706342&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Mon Oct 20 10:19:29 2008
@@ -9,3 +9,8 @@
 were developed by Martin Porter and Richard Boulton.
 The full snowball package is available from
   http://snowball.tartarus.org/
+
+The Arabic stemmer (contrib/analyzers) comes with a default
+stopword list, created by Jacques Savoy, that is BSD-licensed.  The file
+resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
+See http://members.unine.ch/jacques.savoy/clef/index.html.
\ No newline at end of file

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,124 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+
+/**
+ * Analyzer for Arabic. 
+ * <p>
+ * This analyzer implements light-stemming as specified by:
+ * <i>
+ * Improving Stemming for Arabic Information Retrieval: 
+ *      Light Stemming and Co-occurrence Analysis
+ * </i>    
+ * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
+ * <p>
+ * The analysis package contains three primary components:
+ * <ul>
+ *  <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
+ *  <li>{@link ArabicStemFilter}: Arabic light stemming.
+ *  <li>Arabic stop words file: a set of default Arabic stop words.
+ * </ul>
+ * Stop words are removed before normalization and stemming are applied.
+ */
+public final class ArabicAnalyzer extends Analyzer {
+
+  /**
+   * Name of the classpath resource containing the default Arabic stopwords.
+   * 
+   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+   * The stopword list is BSD-Licensed.
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Contains the stopwords used with the StopFilter.
+   */
+  private Set stoptable = new HashSet();
+  /**
+   * The comment character in the stopwords file.  All lines prefixed with this will be ignored.
+   */
+  public static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE} (read as UTF-8).
+   */
+  public ArabicAnalyzer() {
+    try {
+      InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
+      reader.close();
+      stream.close();
+    } catch (IOException e) {
+      // TODO: throw IOException (for now it is rethrown wrapped in a RuntimeException)
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given array of stop words.
+   */
+  public ArabicAnalyzer( String[] stopwords ) {
+    stoptable = StopFilter.makeStopSet( stopwords );
+  }
+
+  /**
+   * Builds an analyzer whose stop words are the keys of the given Hashtable.
+   */
+  public ArabicAnalyzer( Hashtable stopwords ) {
+    stoptable = new HashSet(stopwords.keySet());
+  }
+
+  /**
+   * Builds an analyzer with stop words read from the given file.  Lines can be commented out using {@link #STOPWORDS_COMMENT}
+   */
+  public ArabicAnalyzer( File stopwords ) throws IOException {
+    stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
+  }
+
+
+  /**
+   * Creates a TokenStream which tokenizes all the text in the provided Reader.
+   *
+   * @return  A TokenStream built from an ArabicLetterTokenizer filtered with
+   *          StopFilter, ArabicNormalizationFilter and ArabicStemFilter, in that order.
+   */
+  public final TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new ArabicLetterTokenizer( reader );
+    result = new StopFilter( result, stoptable );
+    result = new ArabicNormalizationFilter( result );
+    result = new ArabicStemFilter( result );
+
+    return result;
+  }
+}
+

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,43 @@
+package org.apache.lucene.analysis.ar;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.LetterTokenizer;
+
+/**
+ * A LetterTokenizer that also treats non-spacing marks as token characters.
+ * The problem with the standard Letter tokenizer is that it fails on diacritics.
+ * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
+ *
+ */
+public class ArabicLetterTokenizer extends LetterTokenizer {
+
+  public ArabicLetterTokenizer(Reader in) {
+    super(in);
+  }
+
+  /** 
+   * Allows for Letter category or NonspacingMark category
+   * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
+   */
+  protected boolean isTokenChar(char c) {
+    return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
+ * NOTE(review): the constructor is protected, so only same-package code or subclasses can create it.
+ */
+
+public class ArabicNormalizationFilter extends TokenFilter {
+
+  protected ArabicNormalizer normalizer = null;
+
+  protected ArabicNormalizationFilter(TokenStream input) {
+    super(input);
+    normalizer = new ArabicNormalizer();
+  }
+
+
+  /** Normalizes the next input token's term buffer in place; returns null when the input returns null. */
+  public Token next(Token reusableToken) throws IOException {
+    if ((reusableToken = input.next(reusableToken)) == null) {
+      return null;
+    } else {
+      int oldlen = reusableToken.termLength();
+      int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
+      if (oldlen != newlen)
+        reusableToken.setTermLength(newlen);
+      return reusableToken;
+    }
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,102 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *  Normalizer for Arabic.
+ *  <p>
+ *  Normalization is done in-place for efficiency, operating on a termbuffer.
+ *  <p>
+ *  Normalization is defined as:
+ *  <ul>
+ *  <li> Normalization of alef with madda or hamza (above or below) to a bare alef.
+ *  <li> Normalization of teh marbuta to heh
+ *  <li> Normalization of dotless yeh (alef maksura) to yeh.
+ *  <li> Removal of Arabic diacritics (the harakat)
+ *  <li> Removal of tatweel (stretching character).
+ * </ul>
+ *
+ */
+public class ArabicNormalizer {
+  public static final char ALEF = '\u0627';
+  public static final char ALEF_MADDA = '\u0622';
+  public static final char ALEF_HAMZA_ABOVE = '\u0623';
+  public static final char ALEF_HAMZA_BELOW = '\u0625';
+
+  public static final char YEH = '\u064A';
+  public static final char DOTLESS_YEH = '\u0649';
+
+  public static final char TEH_MARBUTA = '\u0629';
+  public static final char HEH = '\u0647';
+
+  public static final char TATWEEL = '\u0640';
+
+  public static final char FATHATAN = '\u064B';
+  public static final char DAMMATAN = '\u064C';
+  public static final char KASRATAN = '\u064D';
+  public static final char FATHA = '\u064E';
+  public static final char DAMMA = '\u064F';
+  public static final char KASRA = '\u0650';
+  public static final char SHADDA = '\u0651';
+  public static final char SUKUN = '\u0652';
+
+  /**
+   * Normalize an input buffer of Arabic text, modifying the buffer in place.
+   * 
+   * @param s input buffer
+   * @param len number of valid characters in the input buffer
+   * @return number of valid characters in the buffer after normalization
+   */
+  public int normalize(char s[], int len) {
+ 
+    for (int i = 0; i < len; i++) {
+      if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
+        s[i] = ALEF;
+
+      if (s[i] == DOTLESS_YEH)
+        s[i] = YEH;
+
+      if (s[i] == TEH_MARBUTA)
+        s[i] = HEH;
+
+      if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
+          s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN)
{
+        len = delete(s, i, len);
+        i--;
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Delete a character in-place by shifting the characters after it one position left.
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion (always len - 1)
+   */
+  protected int delete(char s[], int pos, int len) {
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+    return len - 1;
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,61 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.
+ * NOTE(review): the constructor is protected, so only same-package code or subclasses can create it.
+ */
+
+public class ArabicStemFilter extends TokenFilter {
+
+  protected ArabicStemmer stemmer = null;
+
+  protected ArabicStemFilter(TokenStream input) {
+    super(input);
+    stemmer = new ArabicStemmer();
+  }
+
+
+
+  /**
+   * @return  Returns the next token in the stream, or null at EOS
+   */
+  public Token next(Token reusableToken) throws IOException {
+    /**
+     * Stem the term buffer in place; the term length is updated only when stemming changed it.
+     */
+
+
+    if ((reusableToken = input.next(reusableToken)) == null) {
+      return null;
+    } else {
+      int oldlen = reusableToken.termLength();
+      int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
+      if (oldlen != newlen)
+        reusableToken.setTermLength(newlen);
+      return reusableToken;
+    }
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,177 @@
+package org.apache.lucene.analysis.ar;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *  Stemmer for Arabic.
+ *  <p>
+ *  Stemming  is done in-place for efficiency, operating on a termbuffer.
+ *  <p>
+ *  Stemming is defined as:
+ *  <ul>
+ *  <li> Removal of attached definite article, conjunction, and prepositions.
+ *  <li> Stemming of common suffixes.
+ * </ul>
+ *
+ */
+public class ArabicStemmer {
+  public static final char ALEF = '\u0627';
+  public static final char BEH = '\u0628';
+  public static final char TEH_MARBUTA = '\u0629';
+  public static final char TEH = '\u062A';
+  public static final char FEH = '\u0641';
+  public static final char KAF = '\u0643';
+  public static final char LAM = '\u0644';
+  public static final char NOON = '\u0646';
+  public static final char HEH = '\u0647';
+  public static final char WAW = '\u0648';
+  public static final char YEH = '\u064A';
+  
+  public static final char prefixes[][] = {
+      ("" + ALEF + LAM).toCharArray(), 
+      ("" + WAW + ALEF + LAM).toCharArray(), 
+      ("" + BEH + ALEF + LAM).toCharArray(),
+      ("" + KAF + ALEF + LAM).toCharArray(),
+      ("" + FEH + ALEF + LAM).toCharArray(),
+      ("" + WAW).toCharArray(),
+  };
+  
+  public static final char suffixes[][] = {
+    ("" + HEH + ALEF).toCharArray(), 
+    ("" + ALEF + NOON).toCharArray(), 
+    ("" + ALEF + TEH).toCharArray(), 
+    ("" + WAW + NOON).toCharArray(), 
+    ("" + YEH + NOON).toCharArray(), 
+    ("" + YEH + HEH).toCharArray(),
+    ("" + YEH + TEH_MARBUTA).toCharArray(),
+    ("" + HEH).toCharArray(),
+    ("" + TEH_MARBUTA).toCharArray(),
+    ("" + YEH).toCharArray(),
+};
+  
+  /**
+   * Stem an input buffer of Arabic text in place: prefix first, then suffix(es).
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after stemming
+   */
+  public int stem(char s[], int len) {
+    len = stemPrefix(s, len);
+    len = stemSuffix(s, len);
+    
+    return len;
+  }
+  
+  /**
+   * Stem a prefix off an Arabic word; only the first matching entry of the prefix table is removed.
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return new length of input buffer after stemming.
+   */
+  public int stemPrefix(char s[], int len) {
+    for (int i = 0; i < prefixes.length; i++) 
+      if (startsWith(s, len, prefixes[i]))
+        return deleteN(s, 0, len, prefixes[i].length);
+    return len;
+  }
+
+  /**
+   * Stem suffix(es) off an Arabic word; each suffix table entry is tried in order and removed if it matches.
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return new length of input buffer after stemming
+   */
+  public int stemSuffix(char s[], int len) {
+    for (int i = 0; i < suffixes.length; i++) 
+      if (endsWith(s, len, suffixes[i]))
+        len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
+    return len;
+  }
+  
+  /**
+   * Returns true if the prefix matches and can be stemmed
+   * @param s input buffer
+   * @param len length of input buffer
+   * @param prefix prefix to check
+   * @return true if the prefix matches and can be stemmed
+   */
+  boolean startsWith(char s[], int len, char prefix[]) {
+    if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters to remain
+      return false;
+    } else if (len < prefix.length + 2) { // other prefixes require only 2 to remain.
+      return false;
+    } else {
+      for (int i = 0; i < prefix.length; i++)
+        if (s[i] != prefix[i])
+          return false;
+        
+      return true;
+    }
+  }
+  
+  /**
+   * Returns true if the suffix matches and can be stemmed
+   * @param s input buffer
+   * @param len length of input buffer
+   * @param suffix suffix to check
+   * @return true if the suffix matches and can be stemmed
+   */
+  boolean endsWith(char s[], int len, char suffix[]) {
+    if (len < suffix.length + 2) { // all suffixes require at least 2 characters after
stemming
+      return false;
+    } else {
+      for (int i = 0; i < suffix.length; i++)
+        if (s[len - suffix.length + i] != suffix[i])
+          return false;
+        
+      return true;
+    }
+  }
+  
+  
+  /**
+   * Delete n characters in-place
+   * 
+   * @param s Input Buffer
+   * @param pos Position of the first character to delete
+   * @param len Length of input buffer
+   * @param nChars number of characters to delete
+   * @return length of input buffer after deletion
+   */
+  protected int deleteN(char s[], int pos, int len, int nChars) {
+    for (int i = 0; i < nChars; i++)
+      len = delete(s, pos, len);
+    return len;
+  }
+  
+  /**
+   * Delete a character in-place by shifting the characters after it one position left.
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion (always len - 1)
+   */
+  protected int delete(char s[], int pos, int len) {
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+  
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,5 @@
+<html><head></head>
+<body>
+Analyzer for Arabic.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt?rev=706342&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Test the Arabic Normalization Filter
+ *
+ */
+public class TestArabicNormalizationFilter extends TestCase {
+
+  public void testAlifMadda() throws IOException {
+    check("آجن", "اجن");
+  }
+  
+  public void testAlifHamzaAbove() throws IOException {
+    check("أحمد", "احمد");
+  }
+  
+  public void testAlifHamzaBelow() throws IOException {
+    check("إعاذ", "اعاذ");
+  }
+  
+  public void testAlifMaksura() throws IOException {
+    check("بنى", "بني");
+  }
+
+  public void testTehMarbuta() throws IOException {
+    check("فاطمة", "فاطمه");
+  }
+  
+  public void testTatweel() throws IOException {
+    check("روبرـــــت", "روبرت");
+  }
+  
+  public void testFatha() throws IOException {
+    check("مَبنا", "مبنا");
+  }
+  
+  public void testKasra() throws IOException {
+    check("علِي", "علي");
+  }
+  
+  public void testDamma() throws IOException {
+    check("بُوات", "بوات");
+  }
+  
+  public void testFathatan() throws IOException {
+    check("ولداً", "ولدا");
+  }
+  
+  public void testKasratan() throws IOException {
+    check("ولدٍ", "ولد");
+  }
+  
+  public void testDammatan() throws IOException {
+    check("ولدٌ", "ولد");
+  }  
+  
+  public void testSukun() throws IOException {
+    check("نلْسون", "نلسون");
+  }
+  
+  public void testShaddah() throws IOException {
+    check("هتميّ", "هتمي");
+  }  
+  /** Asserts that the first token of input, after tokenizing and normalizing, has the expected term. */
+  private void check(final String input, final String expected) throws IOException {
+    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+    ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
+    final Token reusableToken = new Token();
+    Token nextToken = filter.next(reusableToken);
+    if (nextToken == null)
+      fail();
+    assertEquals(expected, nextToken.term());
+    filter.close();
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java?rev=706342&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
Mon Oct 20 10:19:29 2008
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Test the Arabic Stem Filter.
+ * <p>
+ * Every test feeds a single word through an ArabicLetterTokenizer followed by an
+ * ArabicStemFilter and compares the term of the first resulting token with the
+ * expected stem.
+ */
+public class TestArabicStemFilter extends TestCase {
+
+  // ---- prefixes ----
+
+  public void testAlPrefix() throws IOException {
+    check("الحسن", "حسن");
+  }
+
+  public void testWalPrefix() throws IOException {
+    check("والحسن", "حسن");
+  }
+
+  public void testBalPrefix() throws IOException {
+    check("بالحسن", "حسن");
+  }
+
+  public void testKalPrefix() throws IOException {
+    check("كالحسن", "حسن");
+  }
+
+  public void testFalPrefix() throws IOException {
+    check("فالحسن", "حسن");
+  }
+
+  public void testWaPrefix() throws IOException {
+    check("وحسن", "حسن");
+  }
+
+  // ---- suffixes ----
+
+  public void testAhSuffix() throws IOException {
+    check("زوجها", "زوج");
+  }
+
+  public void testAnSuffix() throws IOException {
+    check("ساهدان", "ساهد");
+  }
+
+  public void testAtSuffix() throws IOException {
+    check("ساهدات", "ساهد");
+  }
+
+  public void testWnSuffix() throws IOException {
+    check("ساهدون", "ساهد");
+  }
+
+  public void testYnSuffix() throws IOException {
+    check("ساهدين", "ساهد");
+  }
+
+  public void testYhSuffix() throws IOException {
+    check("ساهديه", "ساهد");
+  }
+
+  public void testYpSuffix() throws IOException {
+    check("ساهدية", "ساهد");
+  }
+
+  public void testHSuffix() throws IOException {
+    check("ساهده", "ساهد");
+  }
+
+  public void testPSuffix() throws IOException {
+    check("ساهدة", "ساهد");
+  }
+
+  public void testYSuffix() throws IOException {
+    check("ساهدي", "ساهد");
+  }
+
+  // ---- combinations and inputs that must come back unchanged ----
+
+  public void testComboPrefSuf() throws IOException {
+    check("وساهدون", "ساهد");
+  }
+
+  public void testComboSuf() throws IOException {
+    check("ساهدهات", "ساهد");
+  }
+
+  public void testShouldntStem() throws IOException {
+    check("الو", "الو");
+  }
+
+  public void testNonArabic() throws IOException {
+    check("English", "English");
+  }
+
+  /**
+   * Tokenizes {@code input}, runs it through an ArabicStemFilter and asserts that the
+   * term of the first token equals {@code expected}. Only the first token is inspected.
+   *
+   * @param input text handed to the tokenizer
+   * @param expected term text the first filtered token must carry
+   * @throws IOException if reading from or closing the filter fails
+   */
+  private void check(final String input, final String expected) throws IOException {
+    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+    ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
+    try {
+      final Token reusableToken = new Token();
+      Token nextToken = filter.next(reusableToken);
+      // A message-bearing assertion tells the reader which input produced no token.
+      assertNotNull("no token produced for input: " + input, nextToken);
+      assertEquals(expected, nextToken.term());
+    } finally {
+      // Close the stream even when one of the assertions above fails.
+      filter.close();
+    }
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java?rev=706342&r1=706341&r2=706342&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java Mon Oct 20 10:19:29
2008
@@ -57,6 +57,31 @@
   }
 
   /**
+   * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
+   * leading and trailing whitespace). Every line of the file should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param wordfile File containing the wordlist
+   * @param comment The comment string to ignore
+   * @return A HashSet with the file's words
+   */
+  public static HashSet getWordSet(File wordfile, String comment) throws IOException {
+    // The file is opened before the try block: if opening fails there is nothing to close.
+    final FileReader fileReader = new FileReader(wordfile);
+    try {
+      // Parsing and comment filtering are delegated to the Reader-based overload.
+      return getWordSet(fileReader, comment);
+    }
+    finally {
+      fileReader.close();
+    }
+  }
+
+
+  /**
    * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
    * leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
@@ -87,6 +112,41 @@
   }
 
   /**
+   * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param comment The string representing a comment.
+   * @return A HashSet with the reader's words
+   */
+  public static HashSet getWordSet(Reader reader, String comment) throws IOException {
+    HashSet result = new HashSet();
+    BufferedReader br = null;
+    try {
+      // Reuse the caller's BufferedReader when there is one, otherwise wrap the reader.
+      if (reader instanceof BufferedReader) {
+        br = (BufferedReader) reader;
+      } else {
+        br = new BufferedReader(reader);
+      }
+      String line = null;
+      while ((line = br.readLine()) != null) {
+        // Trim before testing for the comment marker, so that an indented comment
+        // line is skipped instead of being stored as a word.
+        String word = line.trim();
+        if (!word.startsWith(comment)) {
+          result.add(word);
+        }
+      }
+    }
+    finally {
+      // Closing the buffered reader also closes the reader passed in by the caller.
+      if (br != null)
+        br.close();
+    }
+    return result;
+  }
+
+
+
+  /**
    * Reads a stem dictionary. Each line contains:
    * <pre>word<b>\t</b>stem</pre>
    * (i.e. two tab seperated words)

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestWordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestWordlistLoader.java?rev=706342&r1=706341&r2=706342&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestWordlistLoader.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestWordlistLoader.java Mon Oct 20
10:19:29 2008
@@ -35,7 +35,16 @@
     HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
     checkSet(wordSet2);
   }
-  
+
+  /**
+   * Loads a word list using "#" as the comment marker and verifies that the
+   * regular words pass checkSet while the "#comment" line is left out of the set.
+   */
+  public void testComments() throws Exception {
+    final String wordList = "ONE\n  two \nthree\n#comment";
+    final HashSet loaded = WordlistLoader.getWordSet(new StringReader(wordList), "#");
+    checkSet(loaded);
+    assertFalse(loaded.contains("#comment"));
+    assertFalse(loaded.contains("comment"));
+  }
+
+
   private void checkSet(HashSet wordset) {
     assertEquals(3, wordset.size());
     assertTrue(wordset.contains("ONE"));		// case is not modified



Mime
View raw message