This implementation of the Levenshtein distance algorithm >> - * is from > = href=3D"http://www.merriampark.com/ld.htm">http://www.merriampark.com/ >> ld. >> htm

>> + *This implementation of the Levenshtein distance algorithm >> was originally based >> upon the one >> + * presented at > href=3D"http://www.merriampark.com/ld.htm">http://www.merriampark.co >> m/ld.htm

>> * >> *>> * StringUtils.getLevenshteinDistance(null, *) =3D >> IllegalArgumentException >> @@ -4281,76 +4281,64 @@ >> if (s =3D=3D null || t =3D=3D null) { >> throw new IllegalArgumentException("Strings must not be = >> null"); >> } >> - int d[][]; // matrix >> - int n; // length of s >> - int m; // length of t >> - int i; // iterates through s >> - int j; // iterates through t >> - char s_i; // ith character of s >> - char t_j; // jth character of t >> - int cost; // cost >> - >> - // Step 1 >> - n =3D s.length(); >> - m =3D t.length(); >> + >> + /* >> + The difference between this impl. and the previous is >> that, rather than cr >> eating and retaining a matrix of size >> + s.length()+1 by t.length()+1, we maintain two >> single-dimensional arrays of >> length s.length()+1. The first, d, >> + is the 'current working' distance array that=20 >> + maintains >> the newest distance >> cost counts as we iterate through >> + the characters of String s. Each time we increment >> the >> index of String t >> we are comparing, d is copied to p, >> + the second int[]. Doing so allows us to retain the >> previous cost counts a >> s required by the algorithm >> + (taking the minimum of the cost count to the left, up >> one, and diagonally >> up and to the left of the current >> + cost count being calculated). (Note that the arrays >> aren't really copied >> anymore, just switched...this is clearly much >> + better than cloning an array or doing a >> System.arraycopy() each time throu >> gh the outer loop.) >> + >> + Effectively, the difference between the two >> implementations is this one do >> es not cause an out of memory condition >> + when calculating the LD over two very large strings. >> + */ >> + >> + int n =3D s.length(); // length of s >> + int m =3D t.length(); // length of t >> + >> if (n =3D=3D 0) { >> return m; >> - } >> - if (m =3D=3D 0) { >> + } else if (m =3D=3D 0) { >> return n; >> } >> - d =3D new int[n + 1][m + 1]; >> >> - // Step 2 >> - for (i =3D 0; i <=3D n; i++) { >> - d[i][0] =3D i; >> - } >> + int p[] =3D new int[n+1]; //'previous' cost array, = horizontally >> + int d[] =3D new int[n+1]; // cost array, horizontally >> + int _d[]; //placeholder to assist in swapping p and d >> >> - for (j =3D 0; j <=3D m; j++) { >> - d[0][j] =3D j; >> - } >> - >> - // Step 3 >> - for (i =3D 1; i <=3D n; i++) { >> - s_i =3D s.charAt(i - 1); >> - >> - // Step 4 >> - for (j =3D 1; j <=3D m; j++) { >> - t_j =3D t.charAt(j - 1); >> - >> - // Step 5 >> - if (s_i =3D=3D t_j) { >> - cost =3D 0; >> - } else { >> - cost =3D 1; >> - } >> + //indexes into strings s and t >> + int i; // iterates through s >> + int j; // iterates through t >> >> - // Step 6 >> - d[i][j] =3D min(d[i - 1][j] + 1, d[i][j - 1] + 1, = d[i - >> 1][j - 1] + cost); >> - } >> - } >> + char t_j; // jth character of t >> >> - // Step 7 >> - return d[n][m]; >> - } >> + int cost; // cost >> >> - /** >> - *Gets the minimum of three

>> - * >> - * @param a value 1 >> - * @param b value 2 >> - * @param c value 3 >> - * @return the smallest of the values >> - */ >> - private static int min(int a, int b, int c) { >> - // Method copied from NumberUtils to avoid dependency on >> subpackage >> - if (b < a) { >> - a =3D b; >> - } >> - if (c < a) { >> - a =3D c; >> + for (i =3D 0; i<=3Dn; i++) { >> + p[i] =3D i; >> } >> - return a; >> + >> + for (j =3D 1; j<=3Dm; j++) { >> + t_j =3D t.charAt(j-1); >> + d[0] =3D j; >> + >> + for (i=3D1; i<=3Dn; i++) { >> + cost =3D s.charAt(i-1)=3D=3Dt_j ? 0 : 1; >> + >> + d[i] =3D Math.min(Math.min(d[i-1]+1, p[i]+1), >> p[i-1]+cost); //minimum of c >> ell to the left+1, to the top+1, diagonally left and up +cost >> + } >> + >> + //copy current distance counts to 'previous row'=20 >> + distance >> counts >> + _d =3D p; >> + p =3D d; >> + d =3D _d; >> + } >> + >> + //our last action in the above loop was to switch d and p,=20 >> + so >> p now actual >> ly has the most recent cost counts >> + return p[n]; >> } >> >> } >> >> >> --------------------------------------------------------------------- >> To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org >> For additional commands, e-mail: commons-dev-help@jakarta.apache.org >> > > > --------------------------------------------------------------------- > To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org > For additional commands, e-mail: commons-dev-help@jakarta.apache.org > --------------------------------------------------------------------- To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org For additional commands, e-mail: commons-dev-help@jakarta.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org For additional commands, e-mail: commons-dev-help@jakarta.apache.org`int`

values.