Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 83337 invoked from network); 2 Dec 2009 16:09:33 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 2 Dec 2009 16:09:33 -0000 Received: (qmail 74877 invoked by uid 500); 2 Dec 2009 16:09:32 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 74789 invoked by uid 500); 2 Dec 2009 16:09:32 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 74780 invoked by uid 99); 2 Dec 2009 16:09:32 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 02 Dec 2009 16:09:32 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 02 Dec 2009 16:09:19 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 971BC23888EC; Wed, 2 Dec 2009 16:08:57 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r886190 - in /lucene/java/trunk: ./ contrib/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/test/org/apache/lucene/analysis... 
Date: Wed, 02 Dec 2009 16:08:57 -0000 To: java-commits@lucene.apache.org From: rmuir@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091202160857.971BC23888EC@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: rmuir Date: Wed Dec 2 16:08:56 2009 New Revision: 886190 URL: http://svn.apache.org/viewvc?rev=886190&view=rev Log: LUCENE-2062: Bulgarian Analyzer Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (with props) lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java (with props) lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java (with props) lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html (with props) lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt (with props) lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java (with props) lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java (with props) Modified: lucene/java/trunk/NOTICE.txt lucene/java/trunk/contrib/CHANGES.txt Modified: lucene/java/trunk/NOTICE.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=886190&r1=886189&r2=886190&view=diff ============================================================================== --- lucene/java/trunk/NOTICE.txt (original) +++ lucene/java/trunk/NOTICE.txt Wed Dec 2 16:08:56 2009 @@ -20,6 +20,11 @@ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. 
See http://members.unine.ch/jacques.savoy/clef/index.html. +The Bulgarian analyzer (contrib/analyzers) comes with a default +stopword list that is BSD-licensed created by Jacques Savoy. The file resides in +contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. +See http://members.unine.ch/jacques.savoy/clef/index.html. + Includes lib/servlet-api-2.4.jar from Apache Tomcat The SmartChineseAnalyzer source code (under contrib/analyzers) was Modified: lucene/java/trunk/contrib/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=886190&r1=886189&r2=886190&view=diff ============================================================================== --- lucene/java/trunk/contrib/CHANGES.txt (original) +++ lucene/java/trunk/contrib/CHANGES.txt Wed Dec 2 16:08:56 2009 @@ -15,6 +15,8 @@ * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words when Version is set to 3.1 or higher. (Robert Muir) + * LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer) + ======================= Release 3.0.0 2009-11-25 ======================= Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (added) +++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Wed Dec 2 16:08:56 2009 @@ -0,0 +1,176 @@ +package org.apache.lucene.analysis.bg; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Collections; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; + +/** + * {@link Analyzer} for Bulgarian. + *

+ * This analyzer implements light-stemming as specified by: Searching + * Strategies for the Bulgarian Language + * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf + *

+ */ +public final class BulgarianAnalyzer extends Analyzer { + + /** + * File containing default Bulgarian stopwords. + * + * Default stopword list is from + * http://members.unine.ch/jacques.savoy/clef/index.html. The stopword list is + * BSD-Licensed. + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Contains the stopwords used with the StopFilter. + */ + private final Set stoptable; + /** + * The comment character in the stopwords file. All lines prefixed with this + * will be ignored. + */ + public static final String STOPWORDS_COMMENT = "#"; + + /** + * Returns an unmodifiable instance of the default stop-words set. + * + * @return an unmodifiable instance of the default stop-words set. + */ + public static Set getDefaultStopSet() { + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /** + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer + * class accesses the static final set the first time. + */ + private static class DefaultSetHolder { + static final Set DEFAULT_STOP_SET; + + static { + try { + DEFAULT_STOP_SET = loadDefaultStopWordSet(); + } catch (Exception ex) { + // default set should always be present as it is part of the + // distribution (JAR) + throw new RuntimeException("Unable to load default stopword set", ex); + } + } + + static Set loadDefaultStopWordSet() throws IOException { + final InputStream stream = BulgarianAnalyzer.class + .getResourceAsStream(DEFAULT_STOPWORD_FILE); + try { + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + // make sure it is unmodifiable as we expose it in the outer class + return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, + STOPWORDS_COMMENT)); + } finally { + if(stream != null) + stream.close(); + } + } + } + + private final Version matchVersion; + + /** + * Builds an analyzer with the default stop words: + * {@link #DEFAULT_STOPWORD_FILE}. 
+ */ + public BulgarianAnalyzer(Version matchVersion) { + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); + } + + /** + * Builds an analyzer with the given stop words. + */ + public BulgarianAnalyzer(Version matchVersion, Set stopwords) { + super(); + stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, + stopwords)); + this.matchVersion = matchVersion; + } + + /** + * Creates a {@link TokenStream} which tokenizes all the text in the provided + * {@link Reader}. + * + * @return A {@link TokenStream} built from a {@link StandardTokenizer} + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, + * {@link StopFilter}, and {@link BulgarianStemFilter}. + */ + @Override + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(matchVersion, reader); + result = new StandardFilter(result); + result = new LowerCaseFilter(matchVersion, result); + result = new StopFilter(matchVersion, result, stoptable); + result = new BulgarianStemFilter(result); + return result; + } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the + * text in the provided {@link Reader}. + * + * @return A {@link TokenStream} built from a {@link StandardTokenizer} + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, + * {@link StopFilter}, and {@link BulgarianStemFilter}. 
+ */ + @Override + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(matchVersion, reader); + streams.result = new StandardFilter(streams.source); + streams.result = new LowerCaseFilter(matchVersion, streams.result); + streams.result = new StopFilter(matchVersion, streams.result, stoptable); + streams.result = new BulgarianStemFilter(streams.result); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } +} Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java (added) +++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java Wed Dec 2 16:08:56 2009 @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.bg; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + +/** + * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian + * words. + */ +public final class BulgarianStemFilter extends TokenFilter { + private final BulgarianStemmer stemmer; + private final TermAttribute termAtt; + + public BulgarianStemFilter(final TokenStream input) { + super(input); + stemmer = new BulgarianStemmer(); + termAtt = addAttribute(TermAttribute.class); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; + } else { + return false; + } + } +} Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java (added) +++ 
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java Wed Dec 2 16:08:56 2009 @@ -0,0 +1,152 @@ +package org.apache.lucene.analysis.bg; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Light Stemmer for Bulgarian. + *

+ * Implements the algorithm described in: + * + * Searching Strategies for the Bulgarian Language + * + * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf + */ +public class BulgarianStemmer { + + /** + * Stem an input buffer of Bulgarian text. + * + * @param s input buffer + * @param len length of input buffer + * @return length of input buffer after normalization + */ + public int stem(final char s[], int len) { + if (len < 4) // do not stem + return len; + + if (len > 5 && endsWith(s, len, "ища")) + return len - 3; + + len = removeArticle(s, len); + len = removePlural(s, len); + + if (len > 3) { + if (endsWith(s, len, "я")) + len--; + if (endsWith(s, len, "а") || + endsWith(s, len, "о") || + endsWith(s, len, "е")) + len--; + } + + // the rule to rewrite ен -> н is duplicated in the paper. + // in the perl implementation referenced by the paper, this is fixed. + // (it is fixed here as well) + if (len > 4 && endsWith(s, len, "ен")) { + s[len - 2] = 'н'; // replace with н + len--; + } + + if (len > 5 && s[len - 2] == 'ъ') { + s[len - 2] = s[len - 1]; // replace ъN with N + len--; + } + + return len; + } + + /** + * Mainly remove the definite article + * @param s input buffer + * @param len length of input buffer + * @return new stemmed length + */ + private int removeArticle(final char s[], final int len) { + if (len > 6 && endsWith(s, len, "ият")) + return len - 3; + + if (len > 5) { + if (endsWith(s, len, "ът") || + endsWith(s, len, "то") || + endsWith(s, len, "те") || + endsWith(s, len, "та") || + endsWith(s, len, "ия")) + return len - 2; + } + + if (len > 4 && endsWith(s, len, "ят")) + return len - 2; + + return len; + } + + private int removePlural(final char s[], final int len) { + if (len > 6) { + if (endsWith(s, len, "овци")) + return len - 3; // replace with о + if (endsWith(s, len, "ове")) + return len - 3; + if (endsWith(s, len, "еве")) { + s[len - 3] = 'й'; // replace with й + return len - 2; + } + } + + if (len > 5) { + if (endsWith(s, len, 
"ища")) + return len - 3; + if (endsWith(s, len, "та")) + return len - 2; + if (endsWith(s, len, "ци")) { + s[len - 2] = 'к'; // replace with к + return len - 1; + } + if (endsWith(s, len, "зи")) { + s[len - 2] = 'г'; // replace with г + return len - 1; + } + + if (s[len - 3] == 'е' && s[len - 1] == 'и') { + s[len - 3] = 'я'; // replace е with я, remove и + return len - 1; + } + } + + if (len > 4) { + if (endsWith(s, len, "си")) { + s[len - 2] = 'х'; // replace with х + return len - 1; + } + if (endsWith(s, len, "и")) + return len - 1; + } + + return len; + } + + private boolean endsWith(final char s[], final int len, final String suffix) { + final int suffixLen = suffix.length(); + if (suffixLen > len) + return false; + for (int i = suffixLen - 1; i >= 0; i--) + if (s[len -(suffixLen - i)] != suffix.charAt(i)) + return false; + + return true; + } +} Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html (added) +++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html Wed Dec 2 16:08:56 2009 @@ -0,0 +1,22 @@ + + + + +Analyzer for Bulgarian. 
+ + Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/package.html ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt (added) +++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt Wed Dec 2 16:08:56 2009 @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там 
+твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java (added) +++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java Wed Dec 2 16:08:56 2009 @@ -0,0 +1,70 @@ +package org.apache.lucene.analysis.bg; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; + +/** + * Test the Bulgarian analyzer + */ +public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase { + + /** + * This test fails with NPE when the stopwords file is missing in classpath + */ + public void testResourcesAvailable() { + new BulgarianAnalyzer(Version.LUCENE_CURRENT); + } + + public void testStopwords() throws IOException { + Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"}); + } + + public void testCustomStopwords() throws IOException { + Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections + .emptySet()); + assertAnalyzesTo(a, "Как се казваш?", + new String[] {"как", "се", "казваш"}); + } + + public void testReusableTokenStream() throws IOException { + Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + assertAnalyzesToReuse(a, "документи", new String[] {"документ"}); + assertAnalyzesToReuse(a, "документ", new String[] {"документ"}); + } + + /** + * Test some examples from the paper + */ + public void testBasicExamples() throws IOException { + Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"}); + assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"}); + + assertAnalyzesTo(a, "компютри", new String[] {"компютр"}); + assertAnalyzesTo(a, "компютър", new String[] {"компютр"}); + + assertAnalyzesTo(a, "градове", new String[] {"град"}); + } +} Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java ------------------------------------------------------------------------------ svn:eol-style = native Added: 
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java?rev=886190&view=auto ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java (added) +++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java Wed Dec 2 16:08:56 2009 @@ -0,0 +1,210 @@ +package org.apache.lucene.analysis.bg; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; + +/** + * Test the Bulgarian Stemmer + */ +public class TestBulgarianStemmer extends BaseTokenStreamTestCase { + /** + * Test showing how masculine noun forms conflate. An example noun for each + * common (and some rare) plural pattern is listed. 
+ */ + public void testMasculineNouns() throws IOException { + BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + + // -ове pattern + assertAnalyzesTo(a, "град", new String[] {"град"}); + assertAnalyzesTo(a, "града", new String[] {"град"}); + assertAnalyzesTo(a, "градът", new String[] {"град"}); + assertAnalyzesTo(a, "градове", new String[] {"град"}); + assertAnalyzesTo(a, "градовете", new String[] {"град"}); + + // -и pattern + assertAnalyzesTo(a, "народ", new String[] {"народ"}); + assertAnalyzesTo(a, "народа", new String[] {"народ"}); + assertAnalyzesTo(a, "народът", new String[] {"народ"}); + assertAnalyzesTo(a, "народи", new String[] {"народ"}); + assertAnalyzesTo(a, "народите", new String[] {"народ"}); + assertAnalyzesTo(a, "народе", new String[] {"народ"}); + + // -ища pattern + assertAnalyzesTo(a, "път", new String[] {"път"}); + assertAnalyzesTo(a, "пътя", new String[] {"път"}); + assertAnalyzesTo(a, "пътят", new String[] {"път"}); + assertAnalyzesTo(a, "пътища", new String[] {"път"}); + assertAnalyzesTo(a, "пътищата", new String[] {"път"}); + + // -чета pattern + assertAnalyzesTo(a, "градец", new String[] {"градец"}); + assertAnalyzesTo(a, "градеца", new String[] {"градец"}); + assertAnalyzesTo(a, "градецът", new String[] {"градец"}); + /* note the below forms conflate with each other, but not the rest */ + assertAnalyzesTo(a, "градовце", new String[] {"градовц"}); + assertAnalyzesTo(a, "градовцете", new String[] {"градовц"}); + + // -овци pattern + assertAnalyzesTo(a, "дядо", new String[] {"дяд"}); + assertAnalyzesTo(a, "дядото", new String[] {"дяд"}); + assertAnalyzesTo(a, "дядовци", new String[] {"дяд"}); + assertAnalyzesTo(a, "дядовците", new String[] {"дяд"}); + + // -е pattern + assertAnalyzesTo(a, "мъж", new String[] {"мъж"}); + assertAnalyzesTo(a, "мъжа", new String[] {"мъж"}); + assertAnalyzesTo(a, "мъже", new String[] {"мъж"}); + assertAnalyzesTo(a, "мъжете", new String[] {"мъж"}); + assertAnalyzesTo(a, "мъжо", new String[] 
{"мъж"}); + /* word is too short, will not remove -ът */ + assertAnalyzesTo(a, "мъжът", new String[] {"мъжът"}); + + // -а pattern + assertAnalyzesTo(a, "крак", new String[] {"крак"}); + assertAnalyzesTo(a, "крака", new String[] {"крак"}); + assertAnalyzesTo(a, "кракът", new String[] {"крак"}); + assertAnalyzesTo(a, "краката", new String[] {"крак"}); + + // брат + assertAnalyzesTo(a, "брат", new String[] {"брат"}); + assertAnalyzesTo(a, "брата", new String[] {"брат"}); + assertAnalyzesTo(a, "братът", new String[] {"брат"}); + assertAnalyzesTo(a, "братя", new String[] {"брат"}); + assertAnalyzesTo(a, "братята", new String[] {"брат"}); + assertAnalyzesTo(a, "брате", new String[] {"брат"}); + } + + /** + * Test showing how feminine noun forms conflate + */ + public void testFeminineNouns() throws IOException { + BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + + assertAnalyzesTo(a, "вест", new String[] {"вест"}); + assertAnalyzesTo(a, "вестта", new String[] {"вест"}); + assertAnalyzesTo(a, "вести", new String[] {"вест"}); + assertAnalyzesTo(a, "вестите", new String[] {"вест"}); + } + + /** + * Test showing how neuter noun forms conflate an example noun for each common + * plural pattern is listed + */ + public void testNeuterNouns() throws IOException { + BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + + // -а pattern + assertAnalyzesTo(a, "дърво", new String[] {"дърв"}); + assertAnalyzesTo(a, "дървото", new String[] {"дърв"}); + assertAnalyzesTo(a, "дърва", new String[] {"дърв"}); + assertAnalyzesTo(a, "дървета", new String[] {"дърв"}); + assertAnalyzesTo(a, "дървата", new String[] {"дърв"}); + assertAnalyzesTo(a, "дърветата", new String[] {"дърв"}); + + // -та pattern + assertAnalyzesTo(a, "море", new String[] {"мор"}); + assertAnalyzesTo(a, "морето", new String[] {"мор"}); + assertAnalyzesTo(a, "морета", new String[] {"мор"}); + assertAnalyzesTo(a, "моретата", new String[] {"мор"}); + + // -я pattern + 
assertAnalyzesTo(a, "изключение", new String[] {"изключени"}); + assertAnalyzesTo(a, "изключението", new String[] {"изключени"}); + assertAnalyzesTo(a, "изключенията", new String[] {"изключени"}); + /* note the below form in this example does not conflate with the rest */ + assertAnalyzesTo(a, "изключения", new String[] {"изключн"}); + } + + /** + * Test showing how adjectival forms conflate + */ + public void testAdjectives() throws IOException { + BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + assertAnalyzesTo(a, "красив", new String[] {"красив"}); + assertAnalyzesTo(a, "красивия", new String[] {"красив"}); + assertAnalyzesTo(a, "красивият", new String[] {"красив"}); + assertAnalyzesTo(a, "красива", new String[] {"красив"}); + assertAnalyzesTo(a, "красивата", new String[] {"красив"}); + assertAnalyzesTo(a, "красиво", new String[] {"красив"}); + assertAnalyzesTo(a, "красивото", new String[] {"красив"}); + assertAnalyzesTo(a, "красиви", new String[] {"красив"}); + assertAnalyzesTo(a, "красивите", new String[] {"красив"}); + } + + /** + * Test some exceptional rules, implemented as rewrites. 
+ */ + public void testExceptions() throws IOException { + BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT); + + // ци -> к + assertAnalyzesTo(a, "собственик", new String[] {"собственик"}); + assertAnalyzesTo(a, "собственика", new String[] {"собственик"}); + assertAnalyzesTo(a, "собственикът", new String[] {"собственик"}); + assertAnalyzesTo(a, "собственици", new String[] {"собственик"}); + assertAnalyzesTo(a, "собствениците", new String[] {"собственик"}); + + // зи -> г + assertAnalyzesTo(a, "подлог", new String[] {"подлог"}); + assertAnalyzesTo(a, "подлога", new String[] {"подлог"}); + assertAnalyzesTo(a, "подлогът", new String[] {"подлог"}); + assertAnalyzesTo(a, "подлози", new String[] {"подлог"}); + assertAnalyzesTo(a, "подлозите", new String[] {"подлог"}); + + // си -> х + assertAnalyzesTo(a, "кожух", new String[] {"кожух"}); + assertAnalyzesTo(a, "кожуха", new String[] {"кожух"}); + assertAnalyzesTo(a, "кожухът", new String[] {"кожух"}); + assertAnalyzesTo(a, "кожуси", new String[] {"кожух"}); + assertAnalyzesTo(a, "кожусите", new String[] {"кожух"}); + + // ъ deletion + assertAnalyzesTo(a, "център", new String[] {"центр"}); + assertAnalyzesTo(a, "центъра", new String[] {"центр"}); + assertAnalyzesTo(a, "центърът", new String[] {"центр"}); + assertAnalyzesTo(a, "центрове", new String[] {"центр"}); + assertAnalyzesTo(a, "центровете", new String[] {"центр"}); + + // е*и -> я* + assertAnalyzesTo(a, "промяна", new String[] {"промян"}); + assertAnalyzesTo(a, "промяната", new String[] {"промян"}); + assertAnalyzesTo(a, "промени", new String[] {"промян"}); + assertAnalyzesTo(a, "промените", new String[] {"промян"}); + + // ен -> н + assertAnalyzesTo(a, "песен", new String[] {"песн"}); + assertAnalyzesTo(a, "песента", new String[] {"песн"}); + assertAnalyzesTo(a, "песни", new String[] {"песн"}); + assertAnalyzesTo(a, "песните", new String[] {"песн"}); + + // -еве -> й + // note: this is the only word i think this rule works for. 
+ // most -еве pluralized nouns are monosyllabic, + // and the stemmer requires length > 6... + assertAnalyzesTo(a, "строй", new String[] {"строй"}); + assertAnalyzesTo(a, "строеве", new String[] {"строй"}); + assertAnalyzesTo(a, "строевете", new String[] {"строй"}); + /* note the below forms conflate with each other, but not the rest */ + assertAnalyzesTo(a, "строя", new String[] {"стр"}); + assertAnalyzesTo(a, "строят", new String[] {"стр"}); + } +} Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java ------------------------------------------------------------------------------ svn:eol-style = native