lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r886190 - in /lucene/java/trunk: ./ contrib/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/test/org/apache/lucene/analysis...
Date Wed, 02 Dec 2009 16:08:57 GMT
Author: rmuir
Date: Wed Dec  2 16:08:56 2009
New Revision: 886190

URL: http://svn.apache.org/viewvc?rev=886190&view=rev
Log:
LUCENE-2062: Bulgarian Analyzer

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
  (with props)
Modified:
    lucene/java/trunk/NOTICE.txt
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=886190&r1=886189&r2=886190&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Wed Dec  2 16:08:56 2009
@@ -20,6 +20,11 @@
 contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 
+The Bulgarian analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed created by Jacques Savoy.  The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
 Includes lib/servlet-api-2.4.jar from  Apache Tomcat
 
 The SmartChineseAnalyzer source code (under contrib/analyzers) was

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=886190&r1=886189&r2=886190&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Dec  2 16:08:56 2009
@@ -15,6 +15,8 @@
  * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
    when Version is set to 3.1 or higher.  (Robert Muir)
    
+ * LUCENE-2062: Add a Bulgarian analyzer.  (Robert Muir, Simon Willnauer)
+   
 
 ======================= Release 3.0.0 2009-11-25 =======================
 

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,176 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Bulgarian.
+ * <p>
+ * This analyzer implements light-stemming as specified by: <i> Searching
+ * Strategies for the Bulgarian Language </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ * <p>
+ * Text is tokenized with {@link StandardTokenizer} and then passed through
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} and
+ * {@link BulgarianStemFilter}, in that order.
+ */
+public final class BulgarianAnalyzer extends Analyzer {
+  
+  /**
+   * File containing default Bulgarian stopwords. It is looked up as a
+   * classpath resource relative to this class.
+   * 
+   * Default stopword list is from
+   * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
+   * BSD-Licensed.
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  
+  /**
+   * Contains the stopwords used with the StopFilter. Always an unmodifiable
+   * copy of the set handed to the constructor.
+   */
+  private final Set<?> stoptable;
+  /**
+   * The comment character in the stopwords file. All lines prefixed with this
+   * will be ignored
+   */
+  public static final String STOPWORDS_COMMENT = "#";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * 
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
+   * class accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+    
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (Exception ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+    
+    /**
+     * Reads {@link #DEFAULT_STOPWORD_FILE} as UTF-8 and returns its words as
+     * an unmodifiable set.
+     * 
+     * @return the unmodifiable default stopword set
+     * @throws IOException if the resource is not on the classpath or cannot
+     *         be read
+     */
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      final InputStream stream = BulgarianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      // getResourceAsStream signals "not found" with null; report that
+      // explicitly instead of letting InputStreamReader fail with a bare NPE
+      if (stream == null) {
+        throw new IOException("Default stopword resource not found: "
+            + DEFAULT_STOPWORD_FILE);
+      }
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+  
+  /** Version passed on to the tokenizer and the version-aware filters. */
+  private final Version matchVersion;
+  
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   * 
+   * @param matchVersion version handed to the tokenizer and filters
+   */
+  public BulgarianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words. The set is copied, so later
+   * changes to <code>stopwords</code> do not affect this analyzer.
+   * 
+   * @param matchVersion version handed to the tokenizer and filters
+   * @param stopwords the stop words to remove from the token stream
+   */
+  public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    super();
+    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
+        stopwords));
+    this.matchVersion = matchVersion;
+  }
+  
+  /**
+   * Wraps a tokenizer in the filter chain shared by
+   * {@link #tokenStream(String, Reader)} and
+   * {@link #reusableTokenStream(String, Reader)}.
+   * 
+   * @param source the tokenizer producing the raw tokens
+   * @return <code>source</code> filtered with {@link StandardFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter}, and
+   *         {@link BulgarianStemFilter}
+   */
+  private TokenStream buildFilterChain(Tokenizer source) {
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stoptable);
+    result = new BulgarianStemFilter(result);
+    return result;
+  }
+  
+  /**
+   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return buildFilterChain(new StandardTokenizer(matchVersion, reader));
+  }
+  
+  /**
+   * Holder for the tokenizer and the end of its filter chain, stored through
+   * setPreviousTokenStream so both can be reused. Static: it needs no
+   * reference to the enclosing analyzer.
+   */
+  private static class SavedStreams {
+    Tokenizer source;
+    TokenStream result;
+  }
+  
+  /**
+   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+   * text in the provided {@link Reader}.
+   * 
+   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader)
+      throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      // nothing saved yet: build the chain once and remember it
+      streams = new SavedStreams();
+      streams.source = new StandardTokenizer(matchVersion, reader);
+      streams.result = buildFilterChain(streams.source);
+      setPreviousTokenStream(streams);
+    } else {
+      // reuse the saved chain, pointing its tokenizer at the new input
+      streams.source.reset(reader);
+    }
+    return streams.result;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that runs every token of the wrapped stream through
+ * {@link BulgarianStemmer}, shortening the term in place.
+ */
+public final class BulgarianStemFilter extends TokenFilter {
+  private final BulgarianStemmer stemmer = new BulgarianStemmer();
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  
+  /**
+   * @param input the token stream whose terms are to be stemmed
+   */
+  public BulgarianStemFilter(final TokenStream input) {
+    super(input);
+  }
+  
+  /**
+   * Advances the wrapped stream and stems the current term.
+   * 
+   * @return false once the wrapped stream is exhausted, true otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    // the stemmer edits the term buffer directly and reports the new length
+    final char[] buffer = termAtt.termBuffer();
+    final int stemmedLength = stemmer.stem(buffer, termAtt.termLength());
+    termAtt.setTermLength(stemmedLength);
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,152 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Bulgarian.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Searching Strategies for the Bulgarian Language
+ * </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ * <p>
+ * All work happens in place on the caller's buffer; each method returns the
+ * number of leading characters that still belong to the word.
+ */
+public class BulgarianStemmer {
+  
+  /** Two-letter article endings stripped from words longer than five characters. */
+  private static final String[] SHORT_ARTICLES = {"ът", "то", "те", "та", "ия"};
+  
+  /**
+   * Stem an input buffer of Bulgarian text.
+   * 
+   * @param s input buffer; may be modified
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int stem(final char[] s, int len) {
+    // words shorter than four characters are never stemmed
+    if (len < 4) {
+      return len;
+    }
+    
+    // a bare -ища ending is cut immediately; no further rule is applied
+    if (len > 5 && endsWith(s, len, "ища")) {
+      return len - 3;
+    }
+    
+    // article first, then plural: the plural rules see the article-free word
+    len = removePlural(s, removeArticle(s, len));
+    
+    if (len > 3) {
+      if (s[len - 1] == 'я') {
+        len--;
+      }
+      final char last = s[len - 1];
+      if (last == 'а' || last == 'о' || last == 'е') {
+        len--;
+      }
+    }
+    
+    // the rule to rewrite ен -> н is duplicated in the paper.
+    // in the perl implementation referenced by the paper, this is fixed.
+    // (it is fixed here as well)
+    if (len > 4 && endsWith(s, len, "ен")) {
+      s[len - 2] = 'н'; // replace with н
+      len--;
+    }
+    
+    // drop a ъ in next-to-last position by moving the last letter over it
+    if (len > 5 && s[len - 2] == 'ъ') {
+      s[len - 2] = s[len - 1]; // replace ъN with N
+      len--;
+    }
+    
+    return len;
+  }
+  
+  /**
+   * Mainly remove the definite article
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return new stemmed length
+   */
+  private int removeArticle(final char[] s, final int len) {
+    if (len > 6 && endsWith(s, len, "ият")) {
+      return len - 3;
+    }
+    
+    if (len > 5) {
+      for (final String article : SHORT_ARTICLES) {
+        if (endsWith(s, len, article)) {
+          return len - 2;
+        }
+      }
+    }
+    
+    return (len > 4 && endsWith(s, len, "ят")) ? len - 2 : len;
+  }
+  
+  /**
+   * Remove or rewrite a plural ending. Longer endings are tried first and
+   * need longer words; the first matching rule wins.
+   * @param s input buffer; may be modified
+   * @param len length of input buffer
+   * @return new stemmed length
+   */
+  private int removePlural(final char[] s, final int len) {
+    if (len > 6) {
+      // -овци keeps its leading о, -ове goes completely: both shrink by 3
+      if (endsWith(s, len, "овци") || endsWith(s, len, "ове")) {
+        return len - 3;
+      }
+      if (endsWith(s, len, "еве")) {
+        s[len - 3] = 'й'; // replace with й
+        return len - 2;
+      }
+    }
+    
+    if (len > 5) {
+      if (endsWith(s, len, "ища")) {
+        return len - 3;
+      }
+      if (endsWith(s, len, "та")) {
+        return len - 2;
+      }
+      if (endsWith(s, len, "ци")) {
+        return rewriteConsonant(s, len, 'к');
+      }
+      if (endsWith(s, len, "зи")) {
+        return rewriteConsonant(s, len, 'г');
+      }
+      if (s[len - 3] == 'е' && s[len - 1] == 'и') {
+        s[len - 3] = 'я'; // replace е with я, remove и
+        return len - 1;
+      }
+    }
+    
+    if (len > 4) {
+      if (endsWith(s, len, "си")) {
+        return rewriteConsonant(s, len, 'х');
+      }
+      if (s[len - 1] == 'и') {
+        return len - 1;
+      }
+    }
+    
+    return len;
+  }
+  
+  /**
+   * Overwrite the next-to-last character with the given consonant and drop
+   * the final character.
+   */
+  private int rewriteConsonant(final char[] s, final int len, final char consonant) {
+    s[len - 2] = consonant;
+    return len - 1;
+  }
+  
+  /**
+   * Check whether the first len characters of s end with suffix, comparing
+   * from the last character backwards.
+   */
+  private boolean endsWith(final char[] s, final int len, final String suffix) {
+    int remaining = suffix.length();
+    if (remaining > len) {
+      return false;
+    }
+    int pos = len;
+    while (remaining > 0) {
+      if (s[--pos] != suffix.charAt(--remaining)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Bulgarian.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,193 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+а
+аз
+ако
+ала
+бе
+без
+беше
+би
+бил
+била
+били
+било
+близо
+бъдат
+бъде
+бяха
+в
+вас
+ваш
+ваша
+вероятно
+вече
+взема
+ви
+вие
+винаги
+все
+всеки
+всички
+всичко
+всяка
+във
+въпреки
+върху
+г
+ги
+главно
+го
+д
+да
+дали
+до
+докато
+докога
+дори
+досега
+доста
+е
+едва
+един
+ето
+за
+зад
+заедно
+заради
+засега
+затова
+защо
+защото
+и
+из
+или
+им
+има
+имат
+иска
+й
+каза
+как
+каква
+какво
+както
+какъв
+като
+кога
+когато
+което
+които
+кой
+който
+колко
+която
+къде
+където
+към
+ли
+м
+ме
+между
+мен
+ми
+мнозина
+мога
+могат
+може
+моля
+момента
+му
+н
+на
+над
+назад
+най
+направи
+напред
+например
+нас
+не
+него
+нея
+ни
+ние
+никой
+нито
+но
+някои
+някой
+няма
+обаче
+около
+освен
+особено
+от
+отгоре
+отново
+още
+пак
+по
+повече
+повечето
+под
+поне
+поради
+после
+почти
+прави
+пред
+преди
+през
+при
+пък
+първо
+с
+са
+само
+се
+сега
+си
+скоро
+след
+сме
+според
+сред
+срещу
+сте
+съм
+със
+също
+т
+тази
+така
+такива
+такъв
+там
+твой
+те
+тези
+ти
+тн
+то
+това
+тогава
+този
+той
+толкова
+точно
+трябва
+тук
+тъй
+тя
+тях
+у
+харесва
+ч
+че
+често
+чрез
+ще
+щом
+я

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Tests for {@link BulgarianAnalyzer}: loading of the default stopwords,
+ * custom stopword sets, stream reuse, and a few end-to-end stemming examples.
+ */
+public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
+  
+  /**
+   * Constructing the analyzer with its default stop set forces the bundled
+   * stopword file to be loaded, so this test fails when that file is missing
+   * from the classpath.
+   */
+  public void testResourcesAvailable() {
+    new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  public void testStopwords() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"});
+  }
+  
+  public void testCustomStopwords() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections
+        .emptySet());
+    assertAnalyzesTo(a, "Как се казваш?", 
+        new String[] {"как", "се", "казваш"});
+  }
+  
+  public void testReusableTokenStream() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
+    assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
+  }
+  
+  /**
+   * Test some examples from the paper
+   * (Searching Strategies for the Bulgarian Language).
+   */
+  public void testBasicExamples() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"});
+    assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"});
+    
+    assertAnalyzesTo(a, "компютри", new String[] {"компютр"});
+    assertAnalyzesTo(a, "компютър", new String[] {"компютр"});
+    
+    assertAnalyzesTo(a, "градове", new String[] {"град"});
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
Wed Dec  2 16:08:56 2009
@@ -0,0 +1,210 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Bulgarian Stemmer. Every case runs a single word through
+ * {@link BulgarianAnalyzer} with its default stopwords and checks the
+ * resulting stem.
+ */
+public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
+  /**
+   * Test showing how masculine noun forms conflate. An example noun for each
+   * common (and some rare) plural pattern is listed.
+   */
+  public void testMasculineNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // -ове pattern
+    assertAnalyzesTo(a, "град", new String[] {"град"});
+    assertAnalyzesTo(a, "града", new String[] {"град"});
+    assertAnalyzesTo(a, "градът", new String[] {"град"});
+    assertAnalyzesTo(a, "градове", new String[] {"град"});
+    assertAnalyzesTo(a, "градовете", new String[] {"град"});
+    
+    // -и pattern
+    assertAnalyzesTo(a, "народ", new String[] {"народ"});
+    assertAnalyzesTo(a, "народа", new String[] {"народ"});
+    assertAnalyzesTo(a, "народът", new String[] {"народ"});
+    assertAnalyzesTo(a, "народи", new String[] {"народ"});
+    assertAnalyzesTo(a, "народите", new String[] {"народ"});
+    assertAnalyzesTo(a, "народе", new String[] {"народ"});
+    
+    // -ища pattern
+    assertAnalyzesTo(a, "път", new String[] {"път"});
+    assertAnalyzesTo(a, "пътя", new String[] {"път"});
+    assertAnalyzesTo(a, "пътят", new String[] {"път"});
+    assertAnalyzesTo(a, "пътища", new String[] {"път"});
+    assertAnalyzesTo(a, "пътищата", new String[] {"път"});
+    
+    // -овце pattern (градец -> градовце)
+    assertAnalyzesTo(a, "градец", new String[] {"градец"});
+    assertAnalyzesTo(a, "градеца", new String[] {"градец"});
+    assertAnalyzesTo(a, "градецът", new String[] {"градец"});
+    /* note the below forms conflate with each other, but not the rest */
+    assertAnalyzesTo(a, "градовце", new String[] {"градовц"});
+    assertAnalyzesTo(a, "градовцете", new String[] {"градовц"});
+    
+    // -овци pattern
+    assertAnalyzesTo(a, "дядо", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядото", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядовци", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядовците", new String[] {"дяд"});
+    
+    // -е pattern
+    assertAnalyzesTo(a, "мъж", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжа", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъже", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжете", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжо", new String[] {"мъж"});
+    /* word is too short (5 letters), will not remove -ът */
+    assertAnalyzesTo(a, "мъжът", new String[] {"мъжът"});
+    
+    // -а pattern
+    assertAnalyzesTo(a, "крак", new String[] {"крак"});
+    assertAnalyzesTo(a, "крака", new String[] {"крак"});
+    assertAnalyzesTo(a, "кракът", new String[] {"крак"});
+    assertAnalyzesTo(a, "краката", new String[] {"крак"});
+    
+    // брат (plural братя)
+    assertAnalyzesTo(a, "брат", new String[] {"брат"});
+    assertAnalyzesTo(a, "брата", new String[] {"брат"});
+    assertAnalyzesTo(a, "братът", new String[] {"брат"});
+    assertAnalyzesTo(a, "братя", new String[] {"брат"});
+    assertAnalyzesTo(a, "братята", new String[] {"брат"});
+    assertAnalyzesTo(a, "брате", new String[] {"брат"});
+  }
+  
+  /**
+   * Test showing how feminine noun forms conflate
+   */
+  public void testFeminineNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    assertAnalyzesTo(a, "вест", new String[] {"вест"});
+    assertAnalyzesTo(a, "вестта", new String[] {"вест"});
+    assertAnalyzesTo(a, "вести", new String[] {"вест"});
+    assertAnalyzesTo(a, "вестите", new String[] {"вест"});
+  }
+  
+  /**
+   * Test showing how neuter noun forms conflate. An example noun for each
+   * common plural pattern is listed.
+   */
+  public void testNeuterNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // -а pattern
+    assertAnalyzesTo(a, "дърво", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървото", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дърва", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървета", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървата", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дърветата", new String[] {"дърв"});
+    
+    // -та pattern
+    assertAnalyzesTo(a, "море", new String[] {"мор"});
+    assertAnalyzesTo(a, "морето", new String[] {"мор"});
+    assertAnalyzesTo(a, "морета", new String[] {"мор"});
+    assertAnalyzesTo(a, "моретата", new String[] {"мор"});
+    
+    // -я pattern
+    assertAnalyzesTo(a, "изключение", new String[] {"изключени"});
+    assertAnalyzesTo(a, "изключението", new String[] {"изключени"});
+    assertAnalyzesTo(a, "изключенията", new String[] {"изключени"});
+    /* note the below form in this example does not conflate with the rest */
+    assertAnalyzesTo(a, "изключения", new String[] {"изключн"});
+  }
+  
+  /**
+   * Test showing how adjectival forms conflate
+   */
+  public void testAdjectives() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "красив", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивия", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивият", new String[] {"красив"});
+    assertAnalyzesTo(a, "красива", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивата", new String[] {"красив"});
+    assertAnalyzesTo(a, "красиво", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивото", new String[] {"красив"});
+    assertAnalyzesTo(a, "красиви", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивите", new String[] {"красив"});
+  }
+  
+  /**
+   * Test some exceptional rules, implemented as rewrites.
+   */
+  public void testExceptions() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // ци -> к
+    assertAnalyzesTo(a, "собственик", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственика", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственикът", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственици", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собствениците", new String[] {"собственик"});
+    
+    // зи -> г
+    assertAnalyzesTo(a, "подлог", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлога", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлогът", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлози", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлозите", new String[] {"подлог"});
+    
+    // си -> х
+    assertAnalyzesTo(a, "кожух", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожуха", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожухът", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожуси", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожусите", new String[] {"кожух"});
+    
+    // ъ deletion
+    assertAnalyzesTo(a, "център", new String[] {"центр"});
+    assertAnalyzesTo(a, "центъра", new String[] {"центр"});
+    assertAnalyzesTo(a, "центърът", new String[] {"центр"});
+    assertAnalyzesTo(a, "центрове", new String[] {"центр"});
+    assertAnalyzesTo(a, "центровете", new String[] {"центр"});
+    
+    // е*и -> я*
+    assertAnalyzesTo(a, "промяна", new String[] {"промян"});
+    assertAnalyzesTo(a, "промяната", new String[] {"промян"});
+    assertAnalyzesTo(a, "промени", new String[] {"промян"});
+    assertAnalyzesTo(a, "промените", new String[] {"промян"});
+    
+    // ен -> н
+    assertAnalyzesTo(a, "песен", new String[] {"песн"});
+    assertAnalyzesTo(a, "песента", new String[] {"песн"});
+    assertAnalyzesTo(a, "песни", new String[] {"песн"});
+    assertAnalyzesTo(a, "песните", new String[] {"песн"});
+    
+    // -еве -> й
+    // note: this is the only word I think this rule works for.
+    // most -еве pluralized nouns are monosyllabic,
+    // and the stemmer requires length > 6...
+    assertAnalyzesTo(a, "строй", new String[] {"строй"});
+    assertAnalyzesTo(a, "строеве", new String[] {"строй"});
+    assertAnalyzesTo(a, "строевете", new String[] {"строй"});
+    /* note the below forms conflate with each other, but not the rest */
+    assertAnalyzesTo(a, "строя", new String[] {"стр"});
+    assertAnalyzesTo(a, "строят", new String[] {"стр"});
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message