lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chr...@apache.org
Subject svn commit: r1344988 - in /lucene/dev/branches/branch_4x: lucene/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ solr/core/src/java/org/apache/solr/analysis/
Date Fri, 01 Jun 2012 05:21:29 GMT
Author: chrism
Date: Fri Jun  1 05:21:28 2012
New Revision: 1344988

URL: http://svn.apache.org/viewvc?rev=1344988&view=rev
Log:
LUCENE-4019: Added support for handling rules with incorrect number of arguments

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
      - copied unchanged from r1344987, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
Modified:
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1344988&r1=1344987&r2=1344988&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Fri Jun  1 05:21:28 2012
@@ -880,6 +880,11 @@ New features
 * LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which 
     generates suggestions by combining two or more terms and/or 
     breaking terms into multiple words.  See Javadocs for usage. (James Dyer)
+
+* LUCENE-4019: Added improved parsing of Hunspell Dictionaries so those
+  rules missing the required number of parameters are either ignored or
+  cause a ParseException (depending on whether strict parsing is enabled).
+  (Luca Cavanna via Chris Male)
    
 Optimizations
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java?rev=1344988&r1=1344987&r2=1344988&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
Fri Jun  1 05:21:28 2012
@@ -50,6 +50,7 @@ public class HunspellDictionary {
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
   private static final boolean IGNORE_CASE_DEFAULT = false;
+  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
 
   private CharArrayMap<List<HunspellWord>> words;
   private CharArrayMap<List<HunspellAffix>> prefixes;
@@ -104,11 +105,27 @@ public class HunspellDictionary {
    * @throws ParseException Can be thrown if the content of the files does not meet expected
formats
    */
   public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version
version, boolean ignoreCase) throws IOException, ParseException {
+    this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
+  }
+
+  /**
+   * Creates a new HunspellDictionary containing the information read from the provided InputStreams
to hunspell affix
+   * and dictionary files
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
+   * @param strictAffixParsing If true, an affix rule with fewer than the required number of elements causes a ParseException; if false, such a rule is skipped
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected
formats
+   */
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version
version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException
{
     this.version = version;
     this.ignoreCase = ignoreCase;
     String encoding = getDictionaryEncoding(affix);
     CharsetDecoder decoder = getJavaEncoding(encoding);
-    readAffixFile(affix, decoder);
+    readAffixFile(affix, decoder, strictAffixParsing);
     words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */,
this.ignoreCase);
     for (InputStream dictionary : dictionaries) {
       readDictionaryFile(dictionary, decoder);
@@ -158,19 +175,19 @@ public class HunspellDictionary {
    * @param decoder CharsetDecoder to decode the content of the file
    * @throws IOException Can be thrown while reading from the InputStream
    */
-  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException
{
+  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict)
throws IOException, ParseException {
     prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
     suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-    
-    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
+
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
     while ((line = reader.readLine()) != null) {
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
       } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
@@ -192,8 +209,9 @@ public class HunspellDictionary {
    */
   private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
                           String header,
-                          BufferedReader reader,
-                          String conditionPattern) throws IOException {
+                          LineNumberReader reader,
+                          String conditionPattern,
+                          boolean strict) throws IOException, ParseException {
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -203,6 +221,13 @@ public class HunspellDictionary {
       String line = reader.readLine();
       String ruleArgs[] = line.split("\\s+");
 
+      if (ruleArgs.length < 5) {
+        if (strict) {
+          throw new ParseException("The affix file contains a rule with less than five elements",
reader.getLineNumber());
+        }
+        continue;
+      }
+
       HunspellAffix affix = new HunspellAffix();
       
       affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java?rev=1344988&r1=1344987&r2=1344988&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
Fri Jun  1 05:21:28 2012
@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.hunsp
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Version;
+import org.junit.Assert;
 import org.junit.Test;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
+import java.util.Arrays;
 
 import static junit.framework.Assert.assertEquals;
 
@@ -56,4 +58,31 @@ public class HunspellDictionaryTest exte
     affixStream.close();
     dictStream.close();
   }
+
+  @Test
+  public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException
{
+    InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
+    InputStream dictStream = getClass().getResourceAsStream("test.dic");
+
+    HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream),
TEST_VERSION_CURRENT, false, false);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
+    //strict parsing disabled: malformed rule is not loaded
+    assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
+
+    affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
+    dictStream = getClass().getResourceAsStream("test.dic");
+    //strict parsing enabled: malformed rule causes ParseException
+    try {
+      dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT,
false, true);
+      Assert.fail();
+    } catch(ParseException e) {
+      Assert.assertEquals("The affix file contains a rule with less than five elements",
e.getMessage());
+      Assert.assertEquals(23, e.getErrorOffset());
+    }
+
+    affixStream.close();
+    dictStream.close();
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java?rev=1344988&r1=1344987&r2=1344988&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
(original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
Fri Jun  1 05:21:28 2012
@@ -40,7 +40,10 @@ import org.apache.lucene.analysis.util.T
  * Both parameters dictionary and affix are mandatory.
  * <br/>
  * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not.
Default false.
- * <br/> 
+ * <br/>
+ * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict
or not. Default true.
+ * If strict, an error while reading an affix rule causes a ParseException; otherwise the malformed rule is ignored.
+ * <br/>
  * Dictionaries for many languages are available through the OpenOffice project.
  * 
  * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
@@ -50,6 +53,7 @@ public class HunspellStemFilterFactory e
   private static final String PARAM_DICTIONARY = "dictionary";
   private static final String PARAM_AFFIX = "affix";
   private static final String PARAM_IGNORE_CASE = "ignoreCase";
+  private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
   private static final String TRUE = "true";
   private static final String FALSE = "false";
   
@@ -72,12 +76,21 @@ public class HunspellStemFilterFactory e
       else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ":
" + pic + ". Must be true or false");
     }
 
+
+    String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING);
+    boolean strictAffixParsing = true;
+    if(strictAffixParsingParam != null) {
+      if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false;
+      else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true;
+      else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING
+ ": " + strictAffixParsingParam + ". Must be true or false");
+    }
+
     try {
       List<InputStream> dictionaries = new ArrayList<InputStream>();
       for (String file : dictionaryFiles) {
         dictionaries.add(loader.openResource(file));
       }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries,
luceneMatchVersion, ignoreCase);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries,
luceneMatchVersion, ignoreCase, strictAffixParsing);
     } catch (Exception e) {
       throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary")
+ ",affix=" + affixFile + "]", e);
     }



Mime
View raw message