commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chtom...@apache.org
Subject [text] TEXT-22: Incorporating https://github.com/apache/commons-lang/commit/103b64a373256feae6ca85f2bf220e7694e48fa4#diff-3f4c835875ddc8a211c0272e8be51394
Date Fri, 02 Dec 2016 19:13:39 GMT
Repository: commons-text
Updated Branches:
  refs/heads/master 6d69377a2 -> a37bead30


TEXT-22: Incorporating https://github.com/apache/commons-lang/commit/103b64a373256feae6ca85f2bf220e7694e48fa4#diff-3f4c835875ddc8a211c0272e8be51394


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/a37bead3
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/a37bead3
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/a37bead3

Branch: refs/heads/master
Commit: a37bead303adb298f59f76389d5704bef6ff183d
Parents: 6d69377
Author: Rob Tompkins <chtompki@gmail.com>
Authored: Thu Dec 1 15:31:37 2016 -0800
Committer: Rob Tompkins <chtompki@gmail.com>
Committed: Thu Dec 1 15:31:37 2016 -0800

----------------------------------------------------------------------
 .../text/similarity/LevenshteinDistance.java    | 45 ++++++--------------
 1 file changed, 13 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/a37bead3/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
index e8628be..01ec3dc 100644
--- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -307,12 +307,10 @@ public class LevenshteinDistance implements EditDistance<Integer>
{
      * <p>A higher score indicates a greater distance.</p>
      *
      * <p>The previous implementation of the Levenshtein distance algorithm
-     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     * was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
+     * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p>
      *
-     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
-     * which can occur when my Java implementation is used with very large strings.<br>
-     * This implementation of the Levenshtein distance algorithm
-     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
+     * <p>This implementation only need one single-dimensional arrays of length s.length()
+ 1</p>
      *
      * <pre>
      * unlimitedCompare(null, *)             = IllegalArgumentException
@@ -339,20 +337,8 @@ public class LevenshteinDistance implements EditDistance<Integer>
{
         }
 
         /*
-           The difference between this impl. and the previous is that, rather
-           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
-           we maintain two single-dimensional arrays of length s.length() + 1.  The first,
d,
-           is the 'current working' distance array that maintains the newest distance cost
-           counts as we iterate through the characters of String s.  Each time we increment
-           the index of String t we are comparing, d is copied to p, the second int[].  Doing
so
-           allows us to retain the previous cost counts as required by the algorithm (taking
-           the minimum of the cost count to the left, up one, and diagonally up and to the
left
-           of the current cost count being calculated).  (Note that the arrays aren't really
-           copied anymore, just switched...this is clearly much better than cloning an array
-           or doing a System.arraycopy() each time  through the outer loop.)
-
-           Effectively, the difference between the two implementations is this one does not
-           cause an out of memory condition when calculating the LD over two very large strings.
+           This implementation use two variable to record the previous cost counts,
+           So this implementation use less memory than previous impl.
          */
 
         int n = left.length(); // length of left
@@ -373,16 +359,15 @@ public class LevenshteinDistance implements EditDistance<Integer>
{
             m = right.length();
         }
 
-        int[] p = new int[n + 1]; //'previous' cost array, horizontally
-        int[] d = new int[n + 1]; // cost array, horizontally
-        int[] tempD; //placeholder to assist in swapping p and d
+        int[] p = new int[n + 1];
 
         // indexes into strings left and right
         int i; // iterates through left
         int j; // iterates through right
+        int upper_left;
+        int upper;
 
         char rightJ; // jth character of right
-
         int cost; // cost
 
         for (i = 0; i <= n; i++) {
@@ -390,23 +375,19 @@ public class LevenshteinDistance implements EditDistance<Integer>
{
         }
 
         for (j = 1; j <= m; j++) {
+            upper_left = p[0];
             rightJ = right.charAt(j - 1);
-            d[0] = j;
+            p[0] = j;
 
             for (i = 1; i <= n; i++) {
+                upper = p[i];
                 cost = left.charAt(i - 1) == rightJ ? 0 : 1;
                 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
-                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+                p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost);
+                upper_left = upper;
             }
-
-            // copy current distance counts to 'previous row' distance counts
-            tempD = p;
-            p = d;
-            d = tempD;
         }
 
-        // our last action in the above loop was to switch d and p, so p now
-        // actually has the most recent cost counts
         return p[n];
     }
 


Mime
View raw message