>> * >> - *

This implementation of the Levenshtein distance algorithm >> - * is from > = href=3D"http://www.merriampark.com/ld.htm">http://www.merriampark.com/ >> ld. >> htm

>> + *

This implementation of the Levenshtein distance algorithm >> was originally based >> upon the one >> + * presented at > href=3D"http://www.merriampark.com/ld.htm">http://www.merriampark.co >> m/ld.htm

>> * >> *
```>>        * StringUtils.getLevenshteinDistance(null, *)             =3D
>> IllegalArgumentException
>> @@ -4281,76 +4281,64 @@
>>           if (s =3D=3D null || t =3D=3D null) {
>>               throw new IllegalArgumentException("Strings must not be =

>> null");
>>           }
>> -        int d[][]; // matrix
>> -        int n; // length of s
>> -        int m; // length of t
>> -        int i; // iterates through s
>> -        int j; // iterates through t
>> -        char s_i; // ith character of s
>> -        char t_j; // jth character of t
>> -        int cost; // cost
>> -
>> -        // Step 1
>> -        n =3D s.length();
>> -        m =3D t.length();
>> +
>> +               /*
>> +               The difference between this impl. and the previous is
>> that, rather than cr
>> eating and retaining a matrix of size
>> +               s.length()+1 by t.length()+1, we maintain two
>> single-dimensional arrays of
>>   length s.length()+1.  The first, d,
>> +               is the 'current working' distance array that=20
>> + maintains
>>   cost counts as we iterate through
>> +               the characters of String s.  Each time we increment
>> the
>> index of String t
>> we are comparing, d is copied to p,
>> +               the second int[].  Doing so allows us to retain the
>> previous cost counts a
>> s required by the algorithm
>> +               (taking the minimum of the cost count to the left, up
>> one, and diagonally
>> up and to the left of the current
>> +               cost count being calculated).  (Note that the arrays
>> aren't really copied
>> anymore, just switched...this is clearly much
>> +               better than cloning an array or doing a
>> System.arraycopy() each time throu
>> gh the outer loop.)
>> +
>> +               Effectively, the difference between the two
>> implementations is this one do
>> es not cause an out of memory condition
>> +               when calculating the LD over two very large strings.
>> +               */
>> +
>> +        int n =3D s.length(); // length of s
>> +        int m =3D t.length(); // length of t
>> +
>>           if (n =3D=3D 0) {
>>               return m;
>> -        }
>> -        if (m =3D=3D 0) {
>> +        } else if (m =3D=3D 0) {
>>               return n;
>>           }
>> -        d =3D new int[n + 1][m + 1];
>>
>> -        // Step 2
>> -        for (i =3D 0; i <=3D n; i++) {
>> -            d[i][0] =3D i;
>> -        }
>> +        int p[] =3D new int[n+1]; //'previous' cost array, =
horizontally
>> +        int d[] =3D new int[n+1]; // cost array, horizontally
>> +               int _d[]; //placeholder to assist in swapping p and d
>>
>> -        for (j =3D 0; j <=3D m; j++) {
>> -            d[0][j] =3D j;
>> -        }
>> -
>> -        // Step 3
>> -        for (i =3D 1; i <=3D n; i++) {
>> -            s_i =3D s.charAt(i - 1);
>> -
>> -            // Step 4
>> -            for (j =3D 1; j <=3D m; j++) {
>> -                t_j =3D t.charAt(j - 1);
>> -
>> -                // Step 5
>> -                if (s_i =3D=3D t_j) {
>> -                    cost =3D 0;
>> -                } else {
>> -                    cost =3D 1;
>> -                }
>> +        //indexes into strings s and t
>> +        int i; // iterates through s
>> +        int j; // iterates through t
>>
>> -                // Step 6
>> -                d[i][j] =3D min(d[i - 1][j] + 1, d[i][j - 1] + 1, =
d[i -
>> 1][j - 1] + cost);
>> -            }
>> -        }
>> +        char t_j; // jth character of t
>>
>> -        // Step 7
>> -        return d[n][m];
>> -    }
>> +        int cost; // cost
>>
>> -    /**
>> -     * Gets the minimum of three `int` values.
>> -     *
>> -     * @param a  value 1
>> -     * @param b  value 2
>> -     * @param c  value 3
>> -     * @return  the smallest of the values
>> -     */
>> -    private static int min(int a, int b, int c) {
>> -        // Method copied from NumberUtils to avoid dependency on
>> subpackage
>> -        if (b < a) {
>> -            a =3D b;
>> -        }
>> -        if (c < a) {
>> -            a =3D c;
>> +        for (i =3D 0; i<=3Dn; i++) {
>> +            p[i] =3D i;
>>           }
>> -        return a;
>> +
>> +        for (j =3D 1; j<=3Dm; j++) {
>> +            t_j =3D t.charAt(j-1);
>> +            d[0] =3D j;
>> +
>> +            for (i=3D1; i<=3Dn; i++) {
>> +                cost =3D s.charAt(i-1)=3D=3Dt_j ? 0 : 1;
>> +
>> +                d[i] =3D Math.min(Math.min(d[i-1]+1, p[i]+1),
>> p[i-1]+cost);  //minimum of c
>> ell to the left+1, to the top+1, diagonally left and up +cost
>> +            }
>> +
>> +            //copy current distance counts to 'previous row'=20
>> + distance
>> counts
>> +            _d =3D p;
>> +            p =3D d;
>> +            d =3D _d;
>> +        }
>> +
>> +        //our last action in the above loop was to switch d and p,=20
>> + so
>> p now actual
>> ly has the most recent cost counts
>> +        return p[n];
>>       }
>>
>>   }
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
>> For additional commands, e-mail: commons-dev-help@jakarta.apache.org
>>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: commons-dev-help@jakarta.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org