lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r776704 - in /lucene/java/trunk: ./ contrib/spellchecker/src/java/org/apache/lucene/search/spell/ contrib/spellchecker/src/test/org/apache/lucene/search/spell/
Date Wed, 20 May 2009 14:07:09 GMT
Author: gsingers
Date: Wed May 20 14:07:08 2009
New Revision: 776704

URL: http://svn.apache.org/viewvc?rev=776704&view=rev
Log:
LUCENE-1550: Added new ngram spell checking distance

Added:
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
    lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=776704&r1=776703&r2=776704&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed May 20 14:07:08 2009
@@ -311,6 +311,10 @@
 25. LUCENE-1634: Add calibrateSizeByDeletes to LogMergePolicy, to take
     deletions into account when considering merges.  (Yasuhiro Matsuda
     via Mike McCandless)
+
+26. LUCENE-1550: Added new n-gram based String distance measure for spell checking.
+    See the Javadocs for NGramDistance.java for a reference paper on why this is helpful
(Tom Morton via Grant Ingersoll)
+        
     
 Optimizations
 

Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java?rev=776704&r1=776703&r2=776704&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
Wed May 20 14:07:08 2009
@@ -63,12 +63,15 @@
         d = new int[n+1]; 
       
         final int m = other.length();
-
-        if (n == 0) {
-            return 1;
-        } else if (m == 0) {
+        if (n == 0 || m == 0) {
+          if (n == m) {
             return 1;
-        }
+          }
+          else {
+            return 0;
+          }
+        } 
+
 
         // indexes into strings s and t
         int i; // iterates through s

Added: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java?rev=776704&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/NGramDistance.java
Wed May 20 14:07:08 2009
@@ -0,0 +1,144 @@
+package org.apache.lucene.search.spell;
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/**
+ * N-Gram version of edit distance based on paper by Grzegorz Kondrak, 
+ * "N-gram similarity and distance". Proceedings of the Twelfth International 
+ * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, 
+ * Buenos Aires, Argentina, November 2005. 
+ * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
+ * 
+ * This implementation uses the position-based optimization to compute partial
+ * matches of n-gram sub-strings and adds a null-character prefix of size n-1 
+ * so that the first character is contained in the same number of n-grams as 
+ * a middle character.  Null-character prefix matches are discounted so that 
+ * strings with no matching characters will return a distance of 0.
+ * 
+ */
+public class NGramDistance implements StringDistance {
+
+  private int n;
+  
+  /**
+   * Creates an N-Gram distance measure using n-grams of the specified size.
+   * @param size The size of the n-gram to be used to compute the string distance.
+   */
+  public NGramDistance(int size) {
+    this.n = size;
+  }
+  
+  /**
+   * Creates an N-Gram distance measure using n-grams of size 2.
+   */
+  public NGramDistance() {
+    this(2);
+  }
+  
+  public float getDistance(String source, String target) {
+    final int sl = source.length();
+    final int tl = target.length();
+    
+    if (sl == 0 || tl == 0) {
+      if (sl == tl) {
+        return 1;
+      }
+      else {
+        return 0;
+      }
+    }
+
+    int cost = 0;
+    if (sl < n || tl < n) {
+      for (int i=0,ni=Math.min(sl,tl);i<ni;i++) {
+        if (source.charAt(i) == target.charAt(i)) {
+          cost++;
+        }
+      }
+      return (float) cost/Math.max(sl, tl);
+    }
+
+    char[] sa = new char[sl+n-1];
+    float p[]; //'previous' cost array, horizontally
+    float d[]; // cost array, horizontally
+    float _d[]; //placeholder to assist in swapping p and d
+    
+    //construct sa with prefix
+    for (int i=0;i<sa.length;i++) {
+      if (i < n-1) {
+        sa[i]=0; //add prefix
+      }
+      else {
+        sa[i] = source.charAt(i-n+1);
+      }
+    }
+    p = new float[sl+1]; 
+    d = new float[sl+1]; 
+  
+    // indexes into strings s and t
+    int i; // iterates through source
+    int j; // iterates through target
+
+    char[] t_j = new char[n]; // jth n-gram of t
+
+    for (i = 0; i<=sl; i++) {
+        p[i] = i;
+    }
+
+    for (j = 1; j<=tl; j++) {
+        //construct t_j n-gram 
+        if (j < n) {
+          for (int ti=0;ti<n-j;ti++) {
+            t_j[ti]=0; //add prefix
+          }
+          for (int ti=n-j;ti<n;ti++) {
+            t_j[ti]=target.charAt(ti-(n-j));
+          }
+        }
+        else {
+          t_j = target.substring(j-n, j).toCharArray();
+        }
+        d[0] = j;
+        for (i=1; i<=sl; i++) {
+            cost = 0;
+            int tn=n;
+            //compare sa to t_j
+            for (int ni=0;ni<n;ni++) {
+              if (sa[i-1+ni] != t_j[ni]) {
+                cost++;
+              }
+              else if (sa[i-1+ni] == 0) { //discount matches on prefix
+                tn--;
+              }
+            }
+            float ec = (float) cost/tn;
+            // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+            d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+ec);
+        }
+        // copy current distance counts to 'previous row' distance counts
+        _d = p;
+        p = d;
+        d = _d;
+    }
+
+    // our last action in the above loop was to switch d and p, so p now
+    // actually has the most recent cost counts
+    return 1.0f - ((float) p[sl] / Math.max(tl, sl));
+  }
+
+}

Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java?rev=776704&r1=776703&r2=776704&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/StringDistance.java
Wed May 20 14:07:08 2009
@@ -24,7 +24,7 @@
 
   /**
    * Returns a float between 0 and 1 based on how similar the specified strings are to one
another.  
-   * Returning a value of 0 means the specified strings are identical and 1 means the
+   * Returning a value of 1 means the specified strings are identical and 0 means the
    * string are maximally different.
    * @param s1 The first string.
    * @param s2 The second string.

Modified: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java?rev=776704&r1=776703&r2=776704&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
Wed May 20 14:07:08 2009
@@ -45,5 +45,10 @@
     d2 = sd.getDistance("brittney spears", "brittney startzman");
     assertTrue(d1 > d2);
   }
+  
+  public void testEmpty() throws Exception {
+    float d = sd.getDistance("", "al");
+    assertEquals(d,0.0f,0.001);
+  }
 
 }

Added: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java?rev=776704&view=auto
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
(added)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestNGramDistance.java
Wed May 20 14:07:08 2009
@@ -0,0 +1,132 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+public class TestNGramDistance extends TestCase {
+
+  
+  
+  public void testGetDistance1() {
+    StringDistance nsd = new NGramDistance(1);
+    float d = nsd.getDistance("al", "al");
+    assertEquals(d,1.0f,0.001);
+    d = nsd.getDistance("a", "a");
+    assertEquals(d,1.0f,0.001);
+    d = nsd.getDistance("b", "a");
+    assertEquals(d,0.0f,0.001);
+    d = nsd.getDistance("martha", "marhta");
+    assertEquals(d,0.6666,0.001);
+    d = nsd.getDistance("jones", "johnson");
+    assertEquals(d,0.4285,0.001);
+    d = nsd.getDistance("natural", "contrary");
+    assertEquals(d,0.25,0.001);
+    d = nsd.getDistance("abcvwxyz", "cabvwxyz");
+    assertEquals(d,0.75,0.001);    
+    d = nsd.getDistance("dwayne", "duane");
+    assertEquals(d,0.666,0.001);
+    d = nsd.getDistance("dixon", "dicksonx");
+    assertEquals(d,0.5,0.001);
+    d = nsd.getDistance("six", "ten");
+    assertEquals(d,0,0.001);
+    float d1 = nsd.getDistance("zac ephron", "zac efron");
+    float d2 = nsd.getDistance("zac ephron", "kai ephron");
+    assertEquals(d1,d2,0.001);
+    d1 = nsd.getDistance("brittney spears", "britney spears");
+    d2 = nsd.getDistance("brittney spears", "brittney startzman");
+    assertTrue(d1 > d2);
+    d1 = nsd.getDistance("12345678", "12890678");
+    d2 = nsd.getDistance("12345678", "72385698");
+    assertEquals(d1,d2,001);
+  }
+  
+  public void testGetDistance2() {
+    StringDistance sd = new NGramDistance(2);
+    float d = sd.getDistance("al", "al");
+    assertEquals(d,1.0f,0.001);
+    d = sd.getDistance("a", "a");
+    assertEquals(d,1.0f,0.001);
+    d = sd.getDistance("b", "a");
+    assertEquals(d,0.0f,0.001);
+    d = sd.getDistance("a", "aa");
+    assertEquals(d,0.5f,0.001);
+    d = sd.getDistance("martha", "marhta");
+    assertEquals(d,0.6666,0.001);
+    d = sd.getDistance("jones", "johnson");
+    assertEquals(d,0.4285,0.001);
+    d = sd.getDistance("natural", "contrary");
+    assertEquals(d,0.25,0.001);
+    d = sd.getDistance("abcvwxyz", "cabvwxyz");
+    assertEquals(d,0.625,0.001);    
+    d = sd.getDistance("dwayne", "duane");
+    assertEquals(d,0.5833,0.001);
+    d = sd.getDistance("dixon", "dicksonx");
+    assertEquals(d,0.5,0.001);
+    d = sd.getDistance("six", "ten");
+    assertEquals(d,0,0.001);
+    float d1 = sd.getDistance("zac ephron", "zac efron");
+    float d2 = sd.getDistance("zac ephron", "kai ephron");
+    assertTrue(d1 > d2);
+    d1 = sd.getDistance("brittney spears", "britney spears");
+    d2 = sd.getDistance("brittney spears", "brittney startzman");
+    assertTrue(d1 > d2);
+    d1 = sd.getDistance("0012345678", "0012890678");
+    d2 = sd.getDistance("0012345678", "0072385698");
+    assertEquals(d1,d2,0.001);
+  }
+  
+  public void testGetDistance3() {
+    StringDistance sd = new NGramDistance(3);
+    float d = sd.getDistance("al", "al");
+    assertEquals(d,1.0f,0.001);
+    d = sd.getDistance("a", "a");
+    assertEquals(d,1.0f,0.001);
+    d = sd.getDistance("b", "a");
+    assertEquals(d,0.0f,0.001);
+    d = sd.getDistance("martha", "marhta");
+    assertEquals(d,0.7222,0.001);
+    d = sd.getDistance("jones", "johnson");
+    assertEquals(d,0.4762,0.001);
+    d = sd.getDistance("natural", "contrary");
+    assertEquals(d,0.2083,0.001);
+    d = sd.getDistance("abcvwxyz", "cabvwxyz");
+    assertEquals(d,0.5625,0.001);    
+    d = sd.getDistance("dwayne", "duane");
+    assertEquals(d,0.5277,0.001);
+    d = sd.getDistance("dixon", "dicksonx");
+    assertEquals(d,0.4583,0.001);
+    d = sd.getDistance("six", "ten");
+    assertEquals(d,0,0.001);
+    float d1 = sd.getDistance("zac ephron", "zac efron");
+    float d2 = sd.getDistance("zac ephron", "kai ephron");
+    assertTrue(d1 > d2);
+    d1 = sd.getDistance("brittney spears", "britney spears");
+    d2 = sd.getDistance("brittney spears", "brittney startzman");
+    assertTrue(d1 > d2);
+    d1 = sd.getDistance("0012345678", "0012890678");
+    d2 = sd.getDistance("0012345678", "0072385698");
+    assertTrue(d1 < d2);
+  }
+
+  public void testEmpty() throws Exception {
+    StringDistance nsd = new NGramDistance(1);
+    float d = nsd.getDistance("", "al");
+    assertEquals(d,0.0f,0.001);
+  }
+}

Modified: lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java?rev=776704&r1=776703&r2=776704&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
(original)
+++ lucene/java/trunk/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
Wed May 20 14:07:08 2009
@@ -82,6 +82,12 @@
     spellChecker.setAccuracy(0.8f);
     checkCommonSuggestions(r);
     checkJaroWinklerSuggestions();
+    
+    spellChecker.setStringDistance(new NGramDistance(2));
+    spellChecker.setAccuracy(0.5f);
+    checkCommonSuggestions(r);
+    checkNGramSuggestions();
+    
   }
 
   private void checkCommonSuggestions(IndexReader r) throws IOException {
@@ -168,6 +174,14 @@
     String[] similar = spellChecker.suggestSimilar("onety", 2);
     assertEquals(2, similar.length);
     assertEquals(similar[0], "one");
+    assertEquals(similar[1], "ninety");
+  }
+  
+  private void checkNGramSuggestions() throws IOException {
+    String[] similar = spellChecker.suggestSimilar("onety", 2);
+    assertEquals(2, similar.length);
+    assertEquals(similar[0], "one");
+    assertEquals(similar[1], "ninety");
   }
 
   private void addwords(IndexReader r, String field) throws IOException {



Mime
View raw message