lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jan...@apache.org
Subject svn commit: r1175200 - in /lucene/dev/trunk/solr: ./ core/src/java/org/apache/solr/analysis/ core/src/test-files/solr/conf/ core/src/test/org/apache/solr/analysis/
Date Sat, 24 Sep 2011 17:17:27 GMT
Author: janhoy
Date: Sat Sep 24 17:17:27 2011
New Revision: 1175200

URL: http://svn.apache.org/viewvc?rev=1175200&view=rev
Log:
SOLR-2769: Added factory for the new Hunspell stemmer (janhoy, cmale)

Added:
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
  (with props)
    lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.aff
    lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.dic
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java
  (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1175200&r1=1175199&r2=1175200&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sat Sep 24 17:17:27 2011
@@ -349,6 +349,9 @@ New Features
 * SOLR-2066,SOLR-2776: Added support for distributed grouping.
   (Martijn van Groningen, Jasper van Veghel, Matt Beaumont)
 
+* SOLR-2769: Added factory for the new Hunspell stemmer capable of doing stemming 
+  for 99 languages (janhoy, cmale)
+
 Bug Fixes
 ----------------------
 * SOLR-2748: The CommitTracker used for commitWith or autoCommit by maxTime

Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java?rev=1175200&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java Sat Sep 24 17:17:27 2011
@@ -0,0 +1,75 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
+ * Example config for British English including a custom dictionary:
+ * <pre class="prettyprint" >
+ * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
+ *    dictionary=&quot;en_GB.dic,my_custom.dic&quot;
+ *    affix=&quot;en_GB.aff&quot;/&gt;</pre>
+ * Dictionaries for many languages are available through the OpenOffice project
+ * @see http://wiki.services.openoffice.org/wiki/Dictionaries
+ */
+public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  
+  private HunspellDictionary dictionary;
+
+  /**
+   * Loads the hunspell dictionary and affix files defined in the configuration
+   *  
+   * @param loader ResourceLoader used to load the files
+   */
+  public void inform(ResourceLoader loader) {
+    assureMatchVersion();
+    String dictionaryFiles[] = args.get("dictionary").split(",");
+    String affixFile = args.get("affix");
+
+    try {
+      List<InputStream> dictionaries = new ArrayList<InputStream>();
+      for (String file : dictionaryFiles) {
+        dictionaries.add(loader.openResource(file));
+      }
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
+    } catch (Exception e) {
+      throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
+    }
+  }
+
+  /**
+   * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
+   * TokenStream
+   *
+   * @param tokenStream TokenStream that will be filtered
+   * @return HunspellStemFilter that filters the TokenStream 
+   */
+  public TokenStream create(TokenStream tokenStream) {
+    return new HunspellStemFilter(tokenStream, dictionary);
+  }
+}

Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.aff
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.aff?rev=1175200&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.aff (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.aff Sat Sep 24 17:17:27 2011
@@ -0,0 +1,13 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 2
+SFX A   0     e         n
+SFX A   0     e         t
+
+SFX C Y 2
+SFX C   0     d/C       c
+SFX C   0     c         b
+
+PFX B Y 1
+PFX B   0     s         o
\ No newline at end of file

Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.dic
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.dic?rev=1175200&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.dic (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/hunspell-test.dic Sat Sep 24 17:17:27 2011
@@ -0,0 +1,6 @@
+5
+lucen/A
+lucene
+mahout/A
+olr/B
+ab/C
\ No newline at end of file

Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java?rev=1175200&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java Sat Sep 24 17:17:27 2011
@@ -0,0 +1,47 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.schema.IndexSchema;
+
+/**
+ * Simple tests to ensure the Hunspell stemmer loads from factory
+ */
+public class TestHunspellStemFilterFactory extends BaseTokenTestCase {
+  public void testStemming() throws Exception {
+    HunspellStemFilterFactory factory = new HunspellStemFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("dictionary", "hunspell-test.dic");
+    args.put("affix", "hunspell-test.aff");
+    args.put(IndexSchema.LUCENE_MATCH_VERSION_PARAM, DEFAULT_VERSION.name());
+    factory.init(args);
+    factory.inform(new SolrResourceLoader("solr"));
+    
+    Reader reader = new StringReader("abc");
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
+    assertTokenStreamContents(stream, new String[] { "ab" });
+  }
+}



Mime
View raw message