lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject svn commit: r669085 - in /lucene/java/trunk: ./ contrib/spellchecker/src/java/org/apache/lucene/search/spell/ contrib/spellchecker/src/test/org/apache/lucene/search/spell/
Date Wed, 18 Jun 2008 05:01:58 GMT
Author: otis
Date: Tue Jun 17 22:01:57 2008
New Revision: 669085

URL: http://svn.apache.org/viewvc?rev=669085&view=rev
Log:
LUCENE-1297 - Allow other string distance measures for the SpellChecker

Added:
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=669085&r1=669084&r2=669085&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Jun 17 22:01:57 2008
@@ -186,6 +186,9 @@
 
 16. LUCENE-1298: MoreLikeThis can now accept a custom Similarity (Grant Ingersoll)
 
+17. LUCENE-1297: Allow other string distance measures for the SpellChecker
+    (Thomas Morton via Otis Gospodnetic)
+
 Optimizations
 
  1. LUCENE-705: When building a compound file, use

Added: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java?rev=669085&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/JaroWinklerDistance.java
Tue Jun 17 22:01:57 2008
@@ -0,0 +1,112 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+public class JaroWinklerDistance implements StringDistance {
+
+  private float threshold = 0.7f;
+
+  private int[] matches(String s1, String s2) {
+    String max, min;
+    if (s1.length() > s2.length()) {
+      max = s1;
+      min = s2;
+    } else {
+      max = s2;
+      min = s1;
+    }
+    int range = Math.max(max.length() / 2 - 1, 0);
+    int[] matchIndexes = new int[min.length()];
+    Arrays.fill(matchIndexes, -1);
+    boolean[] matchFlags = new boolean[max.length()];
+    int matches = 0;
+    for (int mi = 0; mi < min.length(); mi++) {
+      char c1 = min.charAt(mi);
+      for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max
+          .length()); xi < xn; xi++) {
+        if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+          matchIndexes[mi] = xi;
+          matchFlags[xi] = true;
+          matches++;
+          break;
+        }
+      }
+    }
+    char[] ms1 = new char[matches];
+    char[] ms2 = new char[matches];
+    for (int i = 0, si = 0; i < min.length(); i++) {
+      if (matchIndexes[i] != -1) {
+        ms1[si] = min.charAt(i);
+        si++;
+      }
+    }
+    for (int i = 0, si = 0; i < max.length(); i++) {
+      if (matchFlags[i]) {
+        ms2[si] = max.charAt(i);
+        si++;
+      }
+    }
+    int transpositions = 0;
+    for (int mi = 0; mi < ms1.length; mi++) {
+      if (ms1[mi] != ms2[mi]) {
+        transpositions++;
+      }
+    }
+    int prefix = 0;
+    for (int mi = 0; mi < min.length(); mi++) {
+      if (s1.charAt(mi) == s2.charAt(mi)) {
+        prefix++;
+      } else {
+        break;
+      }
+    }
+    return new int[] { matches, transpositions / 2, prefix, max.length() };
+  }
+
+  public float getDistance(String s1, String s2) {
+    int[] mtp = matches(s1, s2);
+    float m = (float) mtp[0];
+    if (m == 0) {
+      return 0f;
+    }
+    float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
+    float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
+        * (1 - j);
+    return jw;
+  }
+
+  /**
+   * Sets the threshold used to determine when Winkler bonus should be used.
+   * Set to a negative value to get the Jaro distance.
+   * @param threshold the new value of the threshold
+   */
+  public void setThreshold(float threshold) {
+    this.threshold = threshold;
+  }
+
+  /**
+   * Returns the current value of the threshold used for adding the Winkler bonus.
+   * The deafult value is 0.7.
+   * @return the current value of the threshold
+   */
+  public float getThreshold() {
+    return threshold;
+  }
+}

Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java?rev=669085&r1=669084&r2=669085&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
Tue Jun 17 22:01:57 2008
@@ -76,6 +76,8 @@
 
   // minimum score for hits generated by the spell checker query
   private float minScore = 0.5f;
+  
+  private StringDistance sd;
 
   /**
    * Use the given directory as a spell checker index. The directory
@@ -84,10 +86,15 @@
    * @param spellIndex
    * @throws IOException
    */
-  public SpellChecker(Directory spellIndex) throws IOException {
+  public SpellChecker(Directory spellIndex,StringDistance sd) throws IOException {
     this.setSpellIndex(spellIndex);
+    this.setStringDistance(sd);
   }
 
+  public SpellChecker(Directory spellIndex) throws IOException {
+    this(spellIndex,new TRStringDistance());
+  }
+  
   /**
    * Use a different index as the spell checker index or re-open
    * the existing index if <code>spellIndex</code> is the same value
@@ -108,6 +115,11 @@
     }
     searcher = new IndexSearcher(this.spellIndex);
   }
+  
+  public void setStringDistance(StringDistance sd) {
+    this.sd = sd;
+  }
+  
 
   /**
    * Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5
@@ -163,7 +175,6 @@
       String field, boolean morePopular) throws IOException {
 
     float min = this.minScore;
-    final TRStringDistance sd = new TRStringDistance(word);
     final int lengthWord = word.length();
 
     final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word))
: 0;
@@ -217,9 +228,8 @@
         continue;
       }
 
-      // edit distance/normalize with the minScore word length
-      sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
-          .min(sugWord.string.length(), lengthWord));
+      // edit distance
+      sugWord.score = sd.getDistance(word,sugWord.string);
       if (sugWord.score < min) {
         continue;
       }

Added: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java?rev=669085&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
Tue Jun 17 22:01:57 2008
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Interface for string distances.
+ */
+public interface StringDistance {
+
+  /**
+   * Returns a float between 0 and 1 based on how similar the specified strings are to one
another.  
+   * Returning a value of 0 means the specified strings are identical and 1 means the
+   * string are maximally different.
+   * @param s1 The first string.
+   * @param s2 The second string.
+   * @return a float between 0 and 1 based on how similar the specified strings are to one
another.
+   */
+  public float getDistance(String s1,String s2);
+  
+}

Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java?rev=669085&r1=669084&r2=669085&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
Tue Jun 17 22:01:57 2008
@@ -18,34 +18,28 @@
  */
 
 /**
- * Edit distance class.
- * Note: this class is not thread-safe.
+ * Levenshtein edit distance class.
  */
-final class TRStringDistance {
-
-    final char[] sa;
-    final int n;
-    int p[]; //'previous' cost array, horizontally
-    int d[]; // cost array, horizontally
-    int _d[]; //placeholder to assist in swapping p and d
-
+final class TRStringDistance implements StringDistance {
 
     /**
      * Optimized to run a bit faster than the static getDistance().
      * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37%
faster.
      */
-    public TRStringDistance (String target) {
-        sa = target.toCharArray();
-        n = sa.length;
-        p = new int[n+1]; //'previous' cost array, horizontally
-        d = new int[n+1]; // cost array, horizontally
+    public TRStringDistance () {
     }
 
 
     //*****************************
     // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String,
String)
     //*****************************
-    public final int getDistance (String other) {
+    public float getDistance (String target, String other) {
+      char[] sa;
+      int n;
+      int p[]; //'previous' cost array, horizontally
+      int d[]; // cost array, horizontally
+      int _d[]; //placeholder to assist in swapping p and d
+      
         /*
            The difference between this impl. and the previous is that, rather
            than creating and retaining a matrix of size s.length()+1 by t.length()+1,
@@ -63,12 +57,17 @@
            cause an out of memory condition when calculating the LD over two very large strings.
          */
 
+        sa = target.toCharArray();
+        n = sa.length;
+        p = new int[n+1]; 
+        d = new int[n+1]; 
+      
         final int m = other.length();
 
         if (n == 0) {
-            return m;
+            return 1;
         } else if (m == 0) {
-            return n;
+            return 1;
         }
 
         // indexes into strings s and t
@@ -101,7 +100,7 @@
 
         // our last action in the above loop was to switch d and p, so p now
         // actually has the most recent cost counts
-        return p[n];
+        return 1.0f - ((float) p[n] / Math.min(other.length(), sa.length));
     }
 
 }

Added: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java?rev=669085&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestJaroWinklerDistance.java
Tue Jun 17 22:01:57 2008
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+public class TestJaroWinklerDistance extends TestCase {
+
+  private StringDistance sd = new JaroWinklerDistance();
+  
+  public void testGetDistance() {
+    float d = sd.getDistance("al", "al");
+    assertTrue(d == 1.0f);
+    d = sd.getDistance("martha", "marhta");
+    assertTrue(d > 0.961 && d <0.962);
+    d = sd.getDistance("jones", "johnson");
+    assertTrue(d > 0.832 && d < 0.833);
+    d = sd.getDistance("abcvwxyz", "cabvwxyz");
+    assertTrue(d > 0.958 && d < 0.959);
+    d = sd.getDistance("dwayne", "duane");
+    assertTrue(d > 0.84 && d < 0.841);
+    d = sd.getDistance("dixon", "dicksonx");
+    assertTrue(d > 0.813 && d < 0.814);
+    d = sd.getDistance("fvie", "ten");
+    assertTrue(d == 0f);
+    float d1 = sd.getDistance("zac ephron", "zac efron");
+    float d2 = sd.getDistance("zac ephron", "kai ephron");
+    assertTrue(d1 > d2);
+    d1 = sd.getDistance("brittney spears", "britney spears");
+    d2 = sd.getDistance("brittney spears", "brittney startzman");
+    assertTrue(d1 > d2);    
+  }
+
+}
\ No newline at end of file

Added: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java?rev=669085&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
Tue Jun 17 22:01:57 2008
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+public class TestLevenshteinDistance extends TestCase {
+
+  private StringDistance sd = new TRStringDistance();
+  
+  public void testGetDistance() {
+    float d = sd.getDistance("al", "al");
+    assertTrue(d == 1.0f);
+    d = sd.getDistance("martha", "marhta");
+    assertTrue(d > 0.66 && d <0.67);
+    d = sd.getDistance("jones", "johnson");
+    assertTrue(d > 0.199 && d < 0.201);
+    d = sd.getDistance("abcvwxyz", "cabvwxyz");
+    assertTrue(d > 0.749 && d < 0.751);
+    d = sd.getDistance("dwayne", "duane");
+    assertTrue(d > 0.599 && d < 0.601);
+    d = sd.getDistance("dixon", "dicksonx");
+    assertTrue(d > 0.199 && d < 0.201);
+    d = sd.getDistance("six", "ten");
+    assertTrue(d == 0f);
+    float d1 = sd.getDistance("zac ephron", "zac efron");
+    float d2 = sd.getDistance("zac ephron", "kai ephron");
+    assertTrue(d1 < d2);
+    d1 = sd.getDistance("brittney spears", "britney spears");
+    d2 = sd.getDistance("brittney spears", "brittney startzman");
+    assertTrue(d1 > d2);    
+  }
+
+}
\ No newline at end of file

Modified: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java?rev=669085&r1=669084&r2=669085&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
Tue Jun 17 22:01:57 2008
@@ -43,7 +43,7 @@
 
   protected void setUp() throws Exception {
     super.setUp();
-
+    
     //create a user index
     userindex = new RAMDirectory();
     IndexWriter writer = new IndexWriter(userindex, new SimpleAnalyzer(), true);
@@ -75,6 +75,46 @@
 
     assertEquals(num_field2, num_field1 + 1);
 
+    checkCommonSuggestions(r);
+    checkLevenshteinSuggestions(r);
+    
+    spellChecker.setStringDistance(new JaroWinklerDistance());
+    spellChecker.setAccuracy(0.8f);
+    checkCommonSuggestions(r);
+    checkJaroWinklerSuggestions();
+  }
+
+  private void checkCommonSuggestions(IndexReader r) throws IOException {
+    String[] similar = spellChecker.suggestSimilar("fvie", 2);
+    assertTrue(similar.length > 0);
+    assertEquals(similar[0], "five");
+    
+    similar = spellChecker.suggestSimilar("five", 2);
+    if (similar.length > 0) {
+      assertFalse(similar[0].equals("five")); // don't suggest a word for itself
+    }
+    
+    similar = spellChecker.suggestSimilar("fiv", 2);
+    assertTrue(similar.length > 0);
+    assertEquals(similar[0], "five");
+    
+    similar = spellChecker.suggestSimilar("fives", 2);
+    assertTrue(similar.length > 0);
+    assertEquals(similar[0], "five");
+    
+    assertTrue(similar.length > 0);
+    similar = spellChecker.suggestSimilar("fie", 2);
+    assertEquals(similar[0], "five");
+    
+    //  test restraint to a field
+    similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
+    assertEquals(0, similar.length); // there isn't the term thousand in the field field1
+
+    similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
+    assertEquals(1, similar.length); // there is the term thousand in the field field2
+  }
+
+  private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
     // test small word
     String[] similar = spellChecker.suggestSimilar("fvie", 2);
     assertEquals(1, similar.length);
@@ -109,14 +149,21 @@
 
     similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
     assertEquals(1, similar.length); // there is the term thousand in the field field2
-
+    
+    similar = spellChecker.suggestSimilar("onety", 2);
+    assertEquals(1, similar.length);
+    assertEquals(similar[0], "ninety");
     try {
       similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
     } catch (NullPointerException e) {
       assertTrue("threw an NPE, and it shouldn't have", false);
     }
+  }
 
-
+  private void checkJaroWinklerSuggestions() throws IOException {
+    String[] similar = spellChecker.suggestSimilar("onety", 2);
+    assertEquals(2, similar.length);
+    assertEquals(similar[0], "one");
   }
 
   private void addwords(IndexReader r, String field) throws IOException {



Mime
View raw message