lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1572666 - in /lucene/dev/branches/lucene5468/lucene/analysis/common/src: java/org/apache/lucene/analysis/hunspell2/ test/org/apache/lucene/analysis/hunspell2/
Date Thu, 27 Feb 2014 17:53:30 GMT
Author: rmuir
Date: Thu Feb 27 17:53:30 2014
New Revision: 1572666

URL: http://svn.apache.org/r1572666
Log:
LUCENE-5468: convert affixes to FST

Modified:
    lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
    lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
    lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
    lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java

Modified: lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java?rev=1572666&r1=1572665&r2=1572666&view=diff
==============================================================================
--- lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
(original)
+++ lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
Thu Feb 27 17:53:30 2014
@@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUti
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
 
 import java.io.*;
 import java.nio.charset.Charset;
@@ -46,6 +48,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.regex.Pattern;
 
 /**
@@ -68,8 +71,8 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public CharArrayMap<List<Character>> prefixes;
-  public CharArrayMap<List<Character>> suffixes;
+  public FST<IntsRef> prefixes;
+  public FST<IntsRef> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -137,7 +140,7 @@ public class Dictionary {
       ord = lookupOrd(word, offset, length);
     } catch (IOException ex) { /* bogus */ }
     if (ord == null) {
-      return null;
+      return null;  
     }
     return decodeFlags(flagLookup.get(ord, scratch));
   }
@@ -175,8 +178,8 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code
null} if none are found
    */
-  public List<Character> lookupPrefix(char word[], int offset, int length) {
-    return prefixes.get(word, offset, length);
+  IntsRef lookupPrefix(char word[], int offset, int length) {
+    return lookupAffix(prefixes, word, offset, length);
   }
 
   /**
@@ -187,8 +190,42 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code
null} if none are found
    */
-  List<Character> lookupSuffix(char word[], int offset, int length) {
-    return suffixes.get(word, offset, length);
+  IntsRef lookupSuffix(char word[], int offset, int length) {
+    return lookupAffix(suffixes, word, offset, length);
+  }
+  
+  // TODO: this is pretty stupid, considering how the stemming algorithm works
+  // we can speed it up to be significantly faster!
+  IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
+    if (fst == null) {
+      return null;
+    }
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
+    // Accumulate output as we go
+    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    IntsRef output = NO_OUTPUT;
+    
+    int l = offset + length;
+    try {
+      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+        cp = Character.codePointAt(word, i, l);
+        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
+          return null;
+        } else if (arc.output != NO_OUTPUT) {
+          output = fst.outputs.add(output, arc.output);
+        }
+      }
+      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        return fst.outputs.add(output, arc.output);
+      } else {
+        return output;
+      }
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
   }
 
   /**
@@ -199,8 +236,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException,
ParseException {
-    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@@ -218,6 +255,27 @@ public class Dictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       }
     }
+    
+    this.prefixes = affixFST(prefixes);
+    this.suffixes = affixFST(suffixes);
+  }
+  
+  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes)
throws IOException {
+    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+    
+    IntsRef scratch = new IntsRef();
+    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+      Util.toUTF32(entry.getKey(), scratch);
+      List<Character> entries = entry.getValue();
+      IntsRef output = new IntsRef(entries.size());
+      int upto = 0;
+      for (Character c : entries) {
+        output.ints[output.length++] = c;
+      }
+      builder.add(scratch, output);
+    }
+    return builder.finish();
   }
 
   /**
@@ -231,7 +289,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(CharArrayMap<List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Character>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,

Modified: lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java?rev=1572666&r1=1572665&r2=1572666&view=diff
==============================================================================
--- lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
(original)
+++ lucene/dev/branches/lucene5468/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
Thu Feb 27 17:53:30 2014
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
 
 /**
@@ -125,12 +126,13 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
-      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
       if (suffixes == null) {
         continue;
       }
 
-      for (Character suffix : suffixes) {
+      for (int j = 0; j < suffixes.length; j++) {
+        int suffix = suffixes.ints[suffixes.offset + j];
         affixReader.setPosition(8 * suffix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -149,12 +151,13 @@ final class Stemmer {
     }
 
     for (int i = length - 1; i >= 0; i--) {
-      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
+      IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
       if (prefixes == null) {
         continue;
       }
 
-      for (Character prefix : prefixes) {
+      for (int j = 0; j < prefixes.length; j++) {
+        int prefix = prefixes.ints[prefixes.offset + j];
         affixReader.setPosition(8 * prefix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -185,7 +188,7 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int
recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int
recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     

Modified: lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java?rev=1572666&r1=1572665&r2=1572666&view=diff
==============================================================================
--- lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
(original)
+++ lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
Thu Feb 27 17:53:30 2014
@@ -179,6 +179,9 @@ public class TestAllDictionaries extends
           System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)
+ "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ",
" +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) +
", " +
+                             "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns)
+ ", " +
+                             "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData)
+ ", " +
                              "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) +
", " +
                              "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) +
")");
         }

Modified: lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java?rev=1572666&r1=1572665&r2=1572666&view=diff
==============================================================================
--- lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
(original)
+++ lucene/dev/branches/lucene5468/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
Thu Feb 27 17:53:30 2014
@@ -32,8 +32,8 @@ public class TestDictionary extends Luce
     InputStream dictStream = getClass().getResourceAsStream("simple.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
     assertNotNull(flags);
     assertEquals(1, flags.length);
@@ -48,8 +48,8 @@ public class TestDictionary extends Luce
     InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
     
     affixStream.close();



Mime
View raw message