Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm
Precedence: bulk
Reply-To: java-dev@lucene.apache.org
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: svn commit: r886190 - in /lucene/java/trunk: ./ contrib/
 contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
 contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/
 contrib/analyzers/common/src/test/org/apache/lucene/analysis...
Date: Wed, 02 Dec 2009 16:08:57 -0000
To: java-commits@lucene.apache.org
From: rmuir@apache.org
Message-Id: <20091202160857.971BC23888EC@eris.apache.org>

Author: rmuir
Date: Wed Dec  2 16:08:56 2009
New Revision: 886190

URL: http://svn.apache.org/viewvc?rev=886190&view=rev
Log:
LUCENE-2062: Bulgarian Analyzer

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java   (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java   (with props)
Modified:
    lucene/java/trunk/NOTICE.txt
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=886190&r1=886189&r2=886190&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Wed Dec  2 16:08:56 2009
@@ -20,6 +20,11 @@
 contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 
+The Bulgarian analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed created by Jacques Savoy.  The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
 Includes lib/servlet-api-2.4.jar from  Apache Tomcat
 
 The SmartChineseAnalyzer source code (under contrib/analyzers) was

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=886190&r1=886189&r2=886190&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Dec  2 16:08:56 2009
@@ -15,6 +15,8 @@
  * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
    when Version is set to 3.1 or higher.  (Robert Muir)
    
+ * LUCENE-2062: Add a Bulgarian analyzer.  (Robert Muir, Simon Willnauer)
+   
 
 ======================= Release 3.0.0 2009-11-25 =======================
 

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Wed Dec  2 16:08:56 2009
@@ -0,0 +1,176 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Bulgarian.
+ * <p>
+ * This analyzer implements light-stemming as specified by: <i> Searching
+ * Strategies for the Bulgarian Language </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ * <p>
+ */
+public final class BulgarianAnalyzer extends Analyzer {
+  
+  /**
+   * File containing default Bulgarian stopwords.
+   * 
+   * Default stopword list is from
+   * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
+   * BSD-Licensed.
+   */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  
+  /**
+   * Contains the stopwords used with the StopFilter.
+   */
+  private final Set<?> stoptable;
+  /**
+   * The comment character in the stopwords file. All lines prefixed with this
+   * will be ignored
+   */
+  public static final String STOPWORDS_COMMENT = "#";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * 
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Holder that loads DEFAULT_STOP_SET lazily: the set is read from
+   * {@link #DEFAULT_STOPWORD_FILE} when this nested class is initialized,
+   * i.e. the first time the outer class accesses the static final set.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+    
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+    
+    /**
+     * Reads the default stopword file as UTF-8 and returns its words as an
+     * unmodifiable set.
+     * 
+     * @return unmodifiable set of the default stopwords
+     * @throws IOException if the resource cannot be found or read
+     */
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      final InputStream stream = BulgarianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      // getResourceAsStream returns null for a missing resource; report that
+      // explicitly instead of letting the reader constructor fail with an NPE
+      if (stream == null) {
+        throw new IOException("Stopword resource not found: "
+            + DEFAULT_STOPWORD_FILE);
+      }
+      try {
+        final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+  
+  /** Version passed on to the tokenizer and filters that accept one. */
+  private final Version matchVersion;
+  
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   * 
+   * @param matchVersion version handed to the analysis components
+   */
+  public BulgarianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words. The set is copied and
+   * wrapped as unmodifiable before being stored.
+   * 
+   * @param matchVersion version handed to the analysis components
+   * @param stopwords stop words to remove from the token stream
+   */
+  public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    super();
+    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
+        stopwords));
+    this.matchVersion = matchVersion;
+  }
+  
+  /**
+   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new StandardTokenizer(matchVersion, reader);
+    result = new StandardFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stoptable);
+    result = new BulgarianStemFilter(result);
+    return result;
+  }
+  
+  /**
+   * Tokenizer and end of the filter chain kept for reuse. Static because it
+   * never touches the enclosing analyzer instance.
+   */
+  private static final class SavedStreams {
+    Tokenizer source;
+    TokenStream result;
+  }
+  
+  /**
+   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+   * text in the provided {@link Reader}. When a previously saved chain
+   * exists, only its tokenizer is reset onto the new reader.
+   * 
+   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader)
+      throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      streams = new SavedStreams();
+      streams.source = new StandardTokenizer(matchVersion, reader);
+      streams.result = new StandardFilter(streams.source);
+      streams.result = new LowerCaseFilter(matchVersion, streams.result);
+      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
+      streams.result = new BulgarianStemFilter(streams.result);
+      setPreviousTokenStream(streams);
+    } else {
+      streams.source.reset(reader);
+    }
+    return streams.result;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java Wed Dec  2 16:08:56 2009
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
+ * words.
+ */
+public final class BulgarianStemFilter extends TokenFilter {
+  /** Stemmer applied to the term buffer of every token. */
+  private final BulgarianStemmer stemmer = new BulgarianStemmer();
+  /** Term attribute of this stream; its buffer is stemmed in place. */
+  private final TermAttribute termAtt;
+  
+  /**
+   * Wraps the given stream so that each of its tokens is stemmed.
+   * 
+   * @param input stream whose tokens are stemmed
+   */
+  public BulgarianStemFilter(final TokenStream input) {
+    super(input);
+    termAtt = addAttribute(TermAttribute.class);
+  }
+  
+  /**
+   * Advances the wrapped stream and, when a token is available, shortens its
+   * term to the length returned by the stemmer.
+   * 
+   * @return true if a token was produced, false once the input is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] buffer = termAtt.termBuffer();
+    final int stemmedLength = stemmer.stem(buffer, termAtt.termLength());
+    termAtt.setTermLength(stemmedLength);
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java Wed Dec  2 16:08:56 2009
@@ -0,0 +1,152 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Bulgarian.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Searching Strategies for the Bulgarian Language
+ * </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ */
+public class BulgarianStemmer {
+  
+  /**
+   * Stem an input buffer of Bulgarian text.
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int stem(final char s[], int len) {
+    if (len < 4) // do not stem
+      return len;
+    
+    if (len > 5 && endsWith(s, len, "Ð¸ÑÐ°"))
+      return len - 3;
+    
+    len = removeArticle(s, len);
+    len = removePlural(s, len);
+    
+    if (len > 3) {
+      if (endsWith(s, len, "Ñ"))
+        len--;
+      if (endsWith(s, len, "Ð°") ||
+          endsWith(s, len, "Ð¾") ||
+          endsWith(s, len, "Ðµ"))
+        len--;
+    }
+    
+    // the rule to rewrite ÐµÐ½ -> Ð½ is duplicated in the paper.
+    // in the perl implementation referenced by the paper, this is fixed.
+    // (it is fixed here as well)
+    if (len > 4 && endsWith(s, len, "ÐµÐ½")) {
+      s[len - 2] = 'Ð½'; // replace with Ð½
+      len--;
+    }
+    
+    if (len > 5 && s[len - 2] == 'Ñ') {
+      s[len - 2] = s[len - 1]; // replace ÑN with N
+      len--;
+    }
+
+    return len;
+  }
+  
+  /**
+   * Mainly remove the definite article
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return new stemmed length
+   */
+  private int removeArticle(final char s[], final int len) {
+    if (len > 6 && endsWith(s, len, "Ð¸ÑÑ"))
+      return len - 3;
+    
+    if (len > 5) {
+      if (endsWith(s, len, "ÑÑ") ||
+          endsWith(s, len, "ÑÐ¾") ||
+          endsWith(s, len, "ÑÐµ") ||
+          endsWith(s, len, "ÑÐ°") ||
+          endsWith(s, len, "Ð¸Ñ"))
+        return len - 2;
+    }
+    
+    if (len > 4 && endsWith(s, len, "ÑÑ"))
+      return len - 2;
+
+    return len;
+  }
+  
+  private int removePlural(final char s[], final int len) {
+    if (len > 6) {
+      if (endsWith(s, len, "Ð¾Ð²ÑÐ¸"))
+        return len - 3; // replace with Ð¾
+      if (endsWith(s, len, "Ð¾Ð²Ðµ"))
+        return len - 3;
+      if (endsWith(s, len, "ÐµÐ²Ðµ")) {
+        s[len - 3] = 'Ð¹'; // replace with Ð¹
+        return len - 2;
+      }
+    }
+    
+    if (len > 5) {
+      if (endsWith(s, len, "Ð¸ÑÐ°"))
+        return len - 3;
+      if (endsWith(s, len, "ÑÐ°"))
+        return len - 2;
+      if (endsWith(s, len, "ÑÐ¸")) {
+        s[len - 2] = 'Ðº'; // replace with Ðº
+        return len - 1;
+      }
+      if (endsWith(s, len, "Ð·Ð¸")) {
+        s[len - 2] = 'Ð³'; // replace with Ð³
+        return len - 1;
+      }
+      
+      if (s[len - 3] == 'Ðµ' && s[len - 1] == 'Ð¸') {
+        s[len - 3] = 'Ñ'; // replace Ðµ with Ñ, remove Ð¸
+        return len - 1;
+      }
+    }
+    
+    if (len > 4) {
+      if (endsWith(s, len, "ÑÐ¸")) {
+        s[len - 2] = 'Ñ'; // replace with Ñ
+        return len - 1;
+      }
+      if (endsWith(s, len, "Ð¸"))
+        return len - 1;
+    }
+    
+    return len;
+  }
+  
+  private boolean endsWith(final char s[], final int len, final String suffix) {
+    final int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    for (int i = suffixLen - 1; i >= 0; i--)
+      if (s[len -(suffixLen - i)] != suffix.charAt(i))
+        return false;
+    
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html Wed Dec  2 16:08:56 2009
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Bulgarian.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt Wed Dec  2 16:08:56 2009
@@ -0,0 +1,193 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+а
+аз
+ако
+ала
+бе
+без
+беше
+би
+бил
+била
+били
+било
+близо
+бъдат
+бъде
+бяха
+в
+вас
+ваш
+ваша
+вероятно
+вече
+взема
+ви
+вие
+винаги
+все
+всеки
+всички
+всичко
+всяка
+във
+въпреки
+върху
+г
+ги
+главно
+го
+д
+да
+дали
+до
+докато
+докога
+дори
+досега
+доста
+е
+едва
+един
+ето
+за
+зад
+заедно
+заради
+засега
+затова
+защо
+защото
+и
+из
+или
+им
+има
+имат
+иска
+й
+каза
+как
+каква
+какво
+както
+какъв
+като
+кога
+когато
+което
+които
+кой
+който
+колко
+която
+къде
+където
+към
+ли
+м
+ме
+между
+мен
+ми
+мнозина
+мога
+могат
+може
+моля
+момента
+му
+н
+на
+над
+назад
+най
+направи
+напред
+например
+нас
+не
+него
+нея
+ни
+ние
+никой
+нито
+но
+някои
+някой
+няма
+обаче
+около
+освен
+особено
+от
+отгоре
+отново
+още
+пак
+по
+повече
+повечето
+под
+поне
+поради
+после
+почти
+прави
+пред
+преди
+през
+при
+пък
+първо
+с
+са
+само
+се
+сега
+си
+скоро
+след
+сме
+според
+сред
+срещу
+сте
+съм
+със
+също
+т
+тази
+така
+такива
+такъв
+там
+твой
+те
+тези
+ти
+тн
+то
+това
+тогава
+този
+той
+толкова
+точно
+трябва
+тук
+тъй
+тя
+тях
+у
+харесва
+ч
+че
+често
+чрез
+ще
+щом
+я

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java Wed Dec  2 16:08:56 2009
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Bulgarian analyzer
+ */
+public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
+  
+  /**
+   * This test fails when the stopwords file is missing in classpath, because
+   * constructing the analyzer loads the default stopword set.
+   */
+  public void testResourcesAvailable() {
+    new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /**
+   * With the default stopword set, the stopwords are dropped from the output.
+   */
+  public void testStopwords() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"});
+  }
+  
+  /**
+   * With an empty stopword set, every (lowercased) token is kept.
+   */
+  public void testCustomStopwords() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections
+        .emptySet());
+    assertAnalyzesTo(a, "Как се казваш?", 
+        new String[] {"как", "се", "казваш"});
+  }
+  
+  /**
+   * The reusable token stream must give correct results on consecutive uses.
+   */
+  public void testReusableTokenStream() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
+    assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
+  }
+  
+  /**
+   * Test some examples from the paper
+   */
+  public void testBasicExamples() throws IOException {
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"});
+    assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"});
+    
+    assertAnalyzesTo(a, "компютри", new String[] {"компютр"});
+    assertAnalyzesTo(a, "компютър", new String[] {"компютр"});
+    
+    assertAnalyzesTo(a, "градове", new String[] {"град"});
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java?rev=886190&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java Wed Dec  2 16:08:56 2009
@@ -0,0 +1,210 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Bulgarian Stemmer
+ */
+public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
+  /**
+   * Test showing how masculine noun forms conflate. An example noun for each
+   * common (and some rare) plural pattern is listed.
+   */
+  public void testMasculineNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // -ове pattern
+    assertAnalyzesTo(a, "град", new String[] {"град"});
+    assertAnalyzesTo(a, "града", new String[] {"град"});
+    assertAnalyzesTo(a, "градът", new String[] {"град"});
+    assertAnalyzesTo(a, "градове", new String[] {"град"});
+    assertAnalyzesTo(a, "градовете", new String[] {"град"});
+    
+    // -и pattern
+    assertAnalyzesTo(a, "народ", new String[] {"народ"});
+    assertAnalyzesTo(a, "народа", new String[] {"народ"});
+    assertAnalyzesTo(a, "народът", new String[] {"народ"});
+    assertAnalyzesTo(a, "народи", new String[] {"народ"});
+    assertAnalyzesTo(a, "народите", new String[] {"народ"});
+    assertAnalyzesTo(a, "народе", new String[] {"народ"});
+    
+    // -ища pattern
+    assertAnalyzesTo(a, "път", new String[] {"път"});
+    assertAnalyzesTo(a, "пътя", new String[] {"път"});
+    assertAnalyzesTo(a, "пътят", new String[] {"път"});
+    assertAnalyzesTo(a, "пътища", new String[] {"път"});
+    assertAnalyzesTo(a, "пътищата", new String[] {"път"});
+    
+    // -чета pattern
+    assertAnalyzesTo(a, "градец", new String[] {"градец"});
+    assertAnalyzesTo(a, "градеца", new String[] {"градец"});
+    assertAnalyzesTo(a, "градецът", new String[] {"градец"});
+    /* note the below forms conflate with each other, but not the rest */
+    assertAnalyzesTo(a, "градовце", new String[] {"градовц"});
+    assertAnalyzesTo(a, "градовцете", new String[] {"градовц"});
+    
+    // -овци pattern
+    assertAnalyzesTo(a, "дядо", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядото", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядовци", new String[] {"дяд"});
+    assertAnalyzesTo(a, "дядовците", new String[] {"дяд"});
+    
+    // -е pattern
+    assertAnalyzesTo(a, "мъж", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжа", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъже", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжете", new String[] {"мъж"});
+    assertAnalyzesTo(a, "мъжо", new String[] {"мъж"});
+    /* word is too short, will not remove -ът */
+    assertAnalyzesTo(a, "мъжът", new String[] {"мъжът"});
+    
+    // -а pattern
+    assertAnalyzesTo(a, "крак", new String[] {"крак"});
+    assertAnalyzesTo(a, "крака", new String[] {"крак"});
+    assertAnalyzesTo(a, "кракът", new String[] {"крак"});
+    assertAnalyzesTo(a, "краката", new String[] {"крак"});
+    
+    // брат
+    assertAnalyzesTo(a, "брат", new String[] {"брат"});
+    assertAnalyzesTo(a, "брата", new String[] {"брат"});
+    assertAnalyzesTo(a, "братът", new String[] {"брат"});
+    assertAnalyzesTo(a, "братя", new String[] {"брат"});
+    assertAnalyzesTo(a, "братята", new String[] {"брат"});
+    assertAnalyzesTo(a, "брате", new String[] {"брат"});
+  }
+  
+  /**
+   * Test showing how feminine noun forms conflate.
+   */
+  public void testFeminineNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    assertAnalyzesTo(a, "вест", new String[] {"вест"});
+    assertAnalyzesTo(a, "вестта", new String[] {"вест"});
+    assertAnalyzesTo(a, "вести", new String[] {"вест"});
+    assertAnalyzesTo(a, "вестите", new String[] {"вест"});
+  }
+  
+  /**
+   * Test showing how neuter noun forms conflate. An example noun for each
+   * common plural pattern is listed.
+   */
+  public void testNeuterNouns() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // -а pattern
+    assertAnalyzesTo(a, "дърво", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървото", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дърва", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървета", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дървата", new String[] {"дърв"});
+    assertAnalyzesTo(a, "дърветата", new String[] {"дърв"});
+    
+    // -та pattern
+    assertAnalyzesTo(a, "море", new String[] {"мор"});
+    assertAnalyzesTo(a, "морето", new String[] {"мор"});
+    assertAnalyzesTo(a, "морета", new String[] {"мор"});
+    assertAnalyzesTo(a, "моретата", new String[] {"мор"});
+    
+    // -я pattern
+    assertAnalyzesTo(a, "изключение", new String[] {"изключени"});
+    assertAnalyzesTo(a, "изключението", new String[] {"изключени"});
+    assertAnalyzesTo(a, "изключенията", new String[] {"изключени"});
+    /* note the below form in this example does not conflate with the rest */
+    assertAnalyzesTo(a, "изключения", new String[] {"изключн"});
+  }
+  
+  /**
+   * Test showing how adjectival forms conflate.
+   */
+  public void testAdjectives() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(a, "красив", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивия", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивият", new String[] {"красив"});
+    assertAnalyzesTo(a, "красива", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивата", new String[] {"красив"});
+    assertAnalyzesTo(a, "красиво", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивото", new String[] {"красив"});
+    assertAnalyzesTo(a, "красиви", new String[] {"красив"});
+    assertAnalyzesTo(a, "красивите", new String[] {"красив"});
+  }
+  
+  /**
+   * Test some exceptional rules, implemented as rewrites.
+   */
+  public void testExceptions() throws IOException {
+    BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+    
+    // ци -> к
+    assertAnalyzesTo(a, "собственик", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственика", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственикът", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собственици", new String[] {"собственик"});
+    assertAnalyzesTo(a, "собствениците", new String[] {"собственик"});
+    
+    // зи -> г
+    assertAnalyzesTo(a, "подлог", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлога", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлогът", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлози", new String[] {"подлог"});
+    assertAnalyzesTo(a, "подлозите", new String[] {"подлог"});
+    
+    // си -> х
+    assertAnalyzesTo(a, "кожух", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожуха", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожухът", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожуси", new String[] {"кожух"});
+    assertAnalyzesTo(a, "кожусите", new String[] {"кожух"});
+    
+    // ъ deletion
+    assertAnalyzesTo(a, "център", new String[] {"центр"});
+    assertAnalyzesTo(a, "центъра", new String[] {"центр"});
+    assertAnalyzesTo(a, "центърът", new String[] {"центр"});
+    assertAnalyzesTo(a, "центрове", new String[] {"центр"});
+    assertAnalyzesTo(a, "центровете", new String[] {"центр"});
+    
+    // е*и -> я*
+    assertAnalyzesTo(a, "промяна", new String[] {"промян"});
+    assertAnalyzesTo(a, "промяната", new String[] {"промян"});
+    assertAnalyzesTo(a, "промени", new String[] {"промян"});
+    assertAnalyzesTo(a, "промените", new String[] {"промян"});
+    
+    // ен -> н
+    assertAnalyzesTo(a, "песен", new String[] {"песн"});
+    assertAnalyzesTo(a, "песента", new String[] {"песн"});
+    assertAnalyzesTo(a, "песни", new String[] {"песн"});
+    assertAnalyzesTo(a, "песните", new String[] {"песн"});
+    
+    // -еве -> й
+    // note: this is the only word I think this rule works for.
+    // most -еве pluralized nouns are monosyllabic,
+    // and the stemmer requires length > 6...
+    assertAnalyzesTo(a, "строй", new String[] {"строй"});
+    assertAnalyzesTo(a, "строеве", new String[] {"строй"});
+    assertAnalyzesTo(a, "строевете", new String[] {"строй"});
+    /* note the below forms conflate with each other, but not the rest */
+    assertAnalyzesTo(a, "строя", new String[] {"стр"});
+    assertAnalyzesTo(a, "строят", new String[] {"стр"});
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native