lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r827800 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/search/FuzzyTermEnum.java
Date Tue, 20 Oct 2009 21:22:52 GMT
Author: mikemccand
Date: Tue Oct 20 21:22:52 2009
New Revision: 827800

URL: http://svn.apache.org/viewvc?rev=827800&view=rev
Log:
LUCENE-1183: optimize Levenshtein distance computation in FuzzyQuery

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyTermEnum.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=827800&r1=827799&r2=827800&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Oct 20 21:22:52 2009
@@ -121,6 +121,9 @@
 
 Optimizations
 
+* LUCENE-1183: Optimize Levenshtein Distance computation in
+  FuzzyQuery.  (Cédrik Lime via Mike McCandless)
+
 Documentation
 
 Build

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyTermEnum.java?rev=827800&r1=827799&r2=827800&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyTermEnum.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyTermEnum.java Tue Oct 20 21:22:52
2009
@@ -17,11 +17,11 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 
-import java.io.IOException;
-
 /** Subclass of FilteredTermEnum for enumerating all terms that are similar
  * to the specified filter term.
  *
@@ -30,16 +30,11 @@
  */
 public final class FuzzyTermEnum extends FilteredTermEnum {
 
-  /* This should be somewhere around the average long word.
-   * If it is longer, we waste time and space. If it is shorter, we waste a
-   * little bit of time growing the array as we encounter longer words.
-   */
-  private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
-
   /* Allows us save time required to create a new array
    * every time similarity is called.
    */
-  private int[][] d;
+  private int[] p;
+  private int[] d;
 
   private float similarity;
   private boolean endEnum = false;
@@ -51,7 +46,6 @@
 
   private final float minimumSimilarity;
   private final float scale_factor;
-  private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
 
   /**
    * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
@@ -121,8 +115,8 @@
     this.text = searchTerm.text().substring(realPrefixLength);
     this.prefix = searchTerm.text().substring(0, realPrefixLength);
 
-    initializeMaxDistances();
-    this.d = initDistanceArray();
+    this.p = new int[this.text.length()+1];
+    this.d = new int[this.text.length()+1];
 
     setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
   }
@@ -141,10 +135,12 @@
     return false;
   }
   
+  /** {@inheritDoc} */
   public final float difference() {
-    return (float)((similarity - minimumSimilarity) * scale_factor);
+    return (similarity - minimumSimilarity) * scale_factor;
   }
   
+  /** {@inheritDoc} */
   public final boolean endEnum() {
     return endEnum;
   }
@@ -154,18 +150,6 @@
    ******************************/
   
   /**
-   * Finds and returns the smallest of three integers 
-   */
-  private static final int min(int a, int b, int c) {
-    final int t = (a < b) ? a : b;
-    return (t < c) ? t : c;
-  }
-
-  private final int[][] initDistanceArray(){
-    return new int[this.text.length() + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
-  }
-
-  /**
    * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
    * based on how similar the Term is compared to a target term.  It returns
    * exactly 0.0f when
@@ -214,7 +198,7 @@
       return prefix.length() == 0 ? 0.0f : 1.0f - ((float) n / prefix.length());
     }
 
-    final int maxDistance = getMaxDistance(m);
+    final int maxDistance = calculateMaxDistance(m);
 
     if (maxDistance < Math.abs(m-n)) {
       //just adding the characters of m to n or vice-versa results in
@@ -227,56 +211,52 @@
       return 0.0f;
     }
 
-    //let's make sure we have enough room in our array to do the distance calculations.
-    if (d[0].length <= m) {
-      growDistanceArray(m);
+    // init matrix d
+    for (int i = 0; i<=n; ++i) {
+      p[i] = i;
     }
 
-    // init matrix d
-    for (int i = 0; i <= n; i++) d[i][0] = i;
-    for (int j = 0; j <= m; j++) d[0][j] = j;
-    
     // start computing edit distance
-    for (int i = 1; i <= n; i++) {
+    for (int j = 1; j<=m; ++j) { // iterates through target
       int bestPossibleEditDistance = m;
-      final char s_i = text.charAt(i - 1);
-      for (int j = 1; j <= m; j++) {
-        if (s_i != target.charAt(j-1)) {
-            d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
-        }
-        else {
-          d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
-        }
-        bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]);
+      final char t_j = target.charAt(j-1); // jth character of t
+      d[0] = j;
+
+      for (int i=1; i<=n; ++i) { // iterates through text
+        // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
+        if (t_j != text.charAt(i-1)) {
+          d[i] = Math.min(Math.min(d[i-1], p[i]),  p[i-1]) + 1;
+		} else {
+          d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]);
+		}
+        bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
       }
 
       //After calculating row i, the best possible edit distance
       //can be found by found by finding the smallest value in a given column.
       //If the bestPossibleEditDistance is greater than the max distance, abort.
 
-      if (i > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal
is okay, but not greater
+      if (j > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal
is okay, but not greater
         //the closest the target can be to the text is just too far away.
         //this target is leaving the party early.
         return 0.0f;
       }
+
+      // copy current distance counts to 'previous row' distance counts: swap p and d
+      int _d[] = p;
+      p = d;
+      d = _d;
     }
 
+    // our last action in the above loop was to switch d and p, so p now
+    // actually has the most recent cost counts
+
     // this will return less than 0.0 when the edit distance is
     // greater than the number of characters in the shorter word.
     // but this was the formula that was previously used in FuzzyTermEnum,
     // so it has not been changed (even though minimumSimilarity must be
     // greater than 0.0)
-    return 1.0f - ((float)d[n][m] / (float) (prefix.length() + Math.min(n, m)));
-  }
-
-  /**
-   * Grow the second dimension of the array, so that we can calculate the
-   * Levenshtein difference.
-   */
-  private void growDistanceArray(int m) {
-    for (int i = 0; i < d.length; i++) {
-      d[i] = new int[m+1];
-    }
+    return 1.0f - ((float)p[n] / (float) (prefix.length() + Math.min(n, m)));
   }
 
   /**
@@ -286,21 +266,14 @@
    * @param m the length of the "other value"
    * @return the maximum levenshtein distance that we care about
    */
-  private final int getMaxDistance(int m) {
-    return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m);
-  }
-
-  private void initializeMaxDistances() {
-    for (int i = 0; i < maxDistances.length; i++) {
-      maxDistances[i] = calculateMaxDistance(i);
-    }
-  }
-  
   private int calculateMaxDistance(int m) {
     return (int) ((1-minimumSimilarity) * (Math.min(text.length(), m) + prefix.length()));
   }
 
+  /** {@inheritDoc} */
   public void close() throws IOException {
+    p = d = null;
+    searchTerm = null;
     super.close();  //call super.close() and let the garbage collector do its work.
   }
   



Mime
View raw message