lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r1702118 - in /lucene/dev/trunk/lucene: ./ analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/ analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/
Date Wed, 09 Sep 2015 21:48:38 GMT
Author: uschindler
Date: Wed Sep  9 21:48:38 2015
New Revision: 1702118

URL: http://svn.apache.org/r1702118
Log:
LUCENE-6775: Improved MorfologikFilterFactory to allow loading of custom dictionaries from
ResourceLoader

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Sep  9 21:48:38 2015
@@ -144,6 +144,9 @@ Other
 * LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
   anymore. (Adrien Grand)
 
+* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
+  custom dictionaries from ResourceLoader.  (Uwe Schindler)
+
 Build
 
 * LUCENE-6732: Improve checker for invalid source patterns to also

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Wed Sep  9 21:48:38 2015
@@ -17,15 +17,23 @@ package org.apache.lucene.analysis.morfo
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
 import java.util.Map;
+import java.util.Objects;
+
+import morfologik.stemming.Dictionary;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Filter factory for {@link MorfologikFilter}. For backward compatibility polish
  * dictionary is used as default. You can change dictionary resource 
- * by dictionary-resource parameter.
+ * by dictionary-resource parameter:
  * <pre class="prettyprint">
  * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
@@ -34,27 +42,63 @@ import org.apache.lucene.analysis.util.T
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  * 
+ * <p>Alternatively, you can pass in the filenames of FSA ({@code ".dict"} and features "{@code ".info"}" file
+ * (if the features file is not given, its name is derived from the FSA file):
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
  */
-public class MorfologikFilterFactory extends TokenFilterFactory {
+public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   /**
    * The default dictionary resource (for Polish). 
    */
   public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
 
-  /**
-   * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details. 
-   */
-  private final String dictionaryResource;
-
   /** Dictionary resource */
   public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
 
+  /** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
+  public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
+
+  /** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
+   * loads the file with same name like {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
+   * {@code ".info"} suffix.
+   */
+  public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
+
+  private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
+  private Dictionary dictionary; // initialized on inform()
+
   /** Creates a new MorfologikFilterFactory */
   public MorfologikFilterFactory(Map<String,String> args) {
     super(args);
 
-    dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+    // first check FSA and features (at least FSA must be given, features name is guessed):
+    dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
+    dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
+        (dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
+    
+    if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
+      // if we have no FSA/features combination, we resolve the classpath resource:
+      dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
+    } else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
+      // if we have incomplete FSA/features tuple in args
+      throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
+          DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));      
+    } else {
+      dictionaryResource = null;
+      if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
+        // fail if both is given: FSA/features files + classpath resource
+        throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
+            DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
+      }
+    }
     
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -62,7 +106,22 @@ public class MorfologikFilterFactory ext
   }
 
   @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (dictionaryFsaFile != null) {
+      assert dictionaryFeaturesFile != null;
+      assert dictionaryResource == null;
+      try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
+          final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
+        this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
+      }
+    } else {
+      assert dictionaryResource != null;
+      this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
+    }
+  }
+
+  @Override
   public TokenStream create(TokenStream ts) {
-    return new MorfologikFilter(ts, dictionaryResource);
+    return new MorfologikFilter(ts, Objects.requireNonNull(dictionary, "MorfologikFilterFactory was not fully initialized."));
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1702118&r1=1702117&r2=1702118&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Wed Sep  9 21:48:38 2015
@@ -20,17 +20,57 @@ package org.apache.lucene.analysis.morfo
 import java.io.StringReader;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
 
 /**
  * Test for {@link MorfologikFilterFactory}.
  */
 public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
-  public void testCreateDictionary() throws Exception {
+  final ResourceLoader loader = new ClasspathResourceLoader(getClass());
+
+  public void testDefaultDictionary() throws Exception {
     StringReader reader = new StringReader("rowery bilety");
     MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceDictionary() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceLoaderDictionary1() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+  }
+  
+  public void testResourceLoaderDictionary2() throws Exception {
+    StringReader reader = new StringReader("rowery bilety");
+    Map<String,String> params = new HashMap<>();
+    params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+    params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
+    factory.inform(loader);
     TokenStream stream = whitespaceMockTokenizer(reader);
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
@@ -39,12 +79,48 @@ public class TestMorfologikFilterFactory
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
-      HashMap<String,String> map = new HashMap<String,String>();
-      map.put("bogusArg", "bogusValue");
-      new MorfologikFilterFactory(map);
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put("bogusArg", "bogusValue");
+      new MorfologikFilterFactory(params);
       fail();
     } catch (IllegalArgumentException expected) {
       assertTrue(expected.getMessage().contains("Unknown parameters"));
     }
   }
+  
+  public void testIncompatibleArgs1() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+      params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("at the same time"));
+    }
+  }
+  
+  public void testIncompatibleArgs2() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
+      params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
+      params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("at the same time"));
+    }
+  }
+  
+  public void testMissingArgs1() throws Exception {
+    try {
+      HashMap<String,String> params = new HashMap<String,String>();
+      params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
+      new MorfologikFilterFactory(params);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Missing"));
+    }
+  }
 }



Mime
View raw message