commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ki...@apache.org
Subject [text] SANDBOX-491: Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance. This fixes #1 from github. Thanks to Jonathan Baker.
Date Fri, 20 Mar 2015 01:32:23 GMT
Repository: commons-text
Updated Branches:
  refs/heads/master d9d330542 -> 75cdc00af


SANDBOX-491: Allow extra information (e.g. Levenshtein threshold) to be stored as (final)
fields in the StringMetric instance. This fixes #1 from github. Thanks to Jonathan Baker.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/75cdc00a
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/75cdc00a
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/75cdc00a

Branch: refs/heads/master
Commit: 75cdc00afc674aca39a4fca4c1745d029da43648
Parents: d9d3305
Author: Bruno P. Kinoshita <brunodepaulak@yahoo.com.br>
Authored: Thu Mar 19 22:28:44 2015 -0300
Committer: Bruno P. Kinoshita <brunodepaulak@yahoo.com.br>
Committed: Thu Mar 19 22:30:02 2015 -0300

----------------------------------------------------------------------
 src/changes/changes.xml                         |   1 +
 .../commons/text/similarity/FuzzyScore.java     |  36 +--
 .../text/similarity/LevenshteinDistance.java    | 229 +++++++++++++++----
 .../commons/text/similarity/FuzzyScoreTest.java |  47 ++--
 .../similarity/JaroWrinklerDistanceTest.java    |   2 +-
 .../similarity/LevenshteinDistanceTest.java     | 113 ++-------
 .../ParameterizedLevenshteinDistanceTest.java   | 125 ++++++++++
 7 files changed, 372 insertions(+), 181 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 2f87c53..fa6417f 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
   <body>
 
   <release version="1.0" date="tba" description="tba">
+    <action issue="SANDBOX-491" type="fix" dev="kinow" due-to="Jonathan Baker">Allow
extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric
instance.</action>
     <action issue="SANDBOX-486" type="add" dev="kinow">Port Myers algorithm from [collections]</action>
     <action issue="SANDBOX-485" type="add" dev="kinow">Add Hamming distance</action>
     <action issue="SANDBOX-483" type="add" dev="kinow" due-to="britter">Incorporate
String algorithms from Commons Lang</action>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
index 3e72d05..3cf6df2 100644
--- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
+++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java
@@ -33,21 +33,22 @@ import java.util.Locale;
  */
 public class FuzzyScore implements StringMetric<Integer> {
 
+    private final Locale locale;
+
+
     /**
-     * <p>
-     * Find the Fuzzy Score which indicates the similarity score between two
-     * Strings. This method uses the default locale.
-     * </p>
+     * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
      *
-     * @param term a full term that should be matched against, must not be null
-     * @param query the query that will be matched against a term, must not be
-     *            null
-     * @return result score
-     * @throws IllegalArgumentException if either String input {@code null}
+     * @param locale The string matching logic is case insensitive.
+                     A {@link Locale} is necessary to normalize both Strings to lower case.
+     * @throws IllegalArgumentException
+     *         This is thrown if the {@link Locale} parameter is {@code null}.
      */
-    @Override
-    public Integer compare(CharSequence term, CharSequence query) {
-        return compare(term, query, Locale.getDefault());
+    public FuzzyScore(final Locale locale) {
+        if (locale == null) {
+            throw new IllegalArgumentException("Locale must not be null");
+        }
+        this.locale = locale;
     }
 
     /**
@@ -70,17 +71,14 @@ public class FuzzyScore implements StringMetric<Integer> {
      * @param term a full term that should be matched against, must not be null
      * @param query the query that will be matched against a term, must not be
      *            null
-     * @param locale This string matching logic is case insensitive. A locale is
-     *            necessary to normalize both Strings to lower case.
      * @return result score
      * @throws IllegalArgumentException if either String input {@code null} or
      *             Locale input {@code null}
      */
-    public Integer compare(CharSequence term, CharSequence query, Locale locale) {
+    @Override
+    public Integer compare(CharSequence term, CharSequence query) {
         if (term == null || query == null) {
             throw new IllegalArgumentException("Strings must not be null");
-        } else if (locale == null) {
-            throw new IllegalArgumentException("Locale must not be null");
         }
 
         // fuzzy logic is case insensitive. We normalize the Strings to lower
@@ -130,4 +128,8 @@ public class FuzzyScore implements StringMetric<Integer> {
         return score;
     }
 
+    public Locale getLocale() {
+        return locale;
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
index 4bc2e8a..38e5f14 100644
--- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -33,27 +33,52 @@ import java.util.Arrays;
  */
 public class LevenshteinDistance implements StringMetric<Integer> {
 
+    private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance();
+
+    private final Integer threshold;
+
     /**
-     * Find the Levenshtein distance between two Strings.
-     *
-     * <p>A higher score indicates a greater distance.</p>
-     *
      * <p>
-     * The previous implementation of the Levenshtein distance algorithm was
-     * from <a
-     * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com
-     * /ld.htm</a>
+     * This returns the default instance that uses a version
+     * of the algorithm that does not use a threshold parameter.
      * </p>
      *
+     * @see {@link #getDefaultInstance()}
+     */
+    public LevenshteinDistance() {
+        this(null);
+    }
+
+    /**
      * <p>
-     * Chas Emerick has written an implementation in Java, which avoids an
-     * OutOfMemoryError which can occur when my Java implementation is used with
-     * very large strings.<br>
-     * This implementation of the Levenshtein distance algorithm is from <a
-     * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/
-     * ldjava.htm</a>
+     * If the threshold is not null, distance calculations will be limited to a maximum length.
+     * If the threshold is null, the unlimited version of the algorithm will be used.
      * </p>
      *
+     * @param threshold
+     *        If this is null then distances calculations will not be limited.
+     *        This may not be negative.
+     */
+    public LevenshteinDistance(final Integer threshold) {
+        if (threshold != null && threshold < 0) {
+            throw new IllegalArgumentException("Threshold must not be negative");
+        }
+        this.threshold = threshold;
+    }
+
+    /**
+     * <p>Find the Levenshtein distance between two Strings.</p>
+     *
+     * <p>A higher score indicates a greater distance.</p>
+     *
+     * <p>The previous implementation of the Levenshtein distance algorithm
+     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     *
+     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+     * which can occur when my Java implementation is used with very large strings.<br>
+     * This implementation of the Levenshtein distance algorithm
+     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
+     *
      * <pre>
      * distance.compare(null, *)             = IllegalArgumentException
      * distance.compare(*, null)             = IllegalArgumentException
@@ -70,12 +95,23 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
      *
      * @param left the first string, must not be null
      * @param right the second string, must not be null
-     * @return result distance
+     * @return result distance, or -1
      * @throws IllegalArgumentException if either String input {@code null}
      */
-    @Override
     public Integer compare(CharSequence left, CharSequence right) {
-        return compare(left, right, Integer.MAX_VALUE);
+        if (threshold != null) {
+            return limitedCompare(left, right, threshold);
+        } else {
+            return unlimitedCompare(left, right);
+        }
+    }
+
+    public static LevenshteinDistance getDefaultInstance() {
+        return DEFAULT_INSTANCE;
+    }
+
+    public Integer getThreshold() {
+        return threshold;
     }
 
     /**
@@ -91,27 +127,25 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
      * </p>
      *
      * <pre>
-     * distance.compare(null, *, *)             = IllegalArgumentException
-     * distance.compare(*, null, *)             = IllegalArgumentException
-     * distance.compare(*, *, -1)               = IllegalArgumentException
-     * distance.compare("","", 0)               = 0
-     * distance.compare("aaapppp", "", 8)       = 7
-     * distance.compare("aaapppp", "", 7)       = 7
-     * distance.compare("aaapppp", "", 6))      = -1
-     * distance.compare("elephant", "hippo", 7) = 7
-     * distance.compare("elephant", "hippo", 6) = -1
-     * distance.compare("hippo", "elephant", 7) = 7
-     * distance.compare("hippo", "elephant", 6) = -1
+     * limitedCompare(null, *, *)             = IllegalArgumentException
+     * limitedCompare(*, null, *)             = IllegalArgumentException
+     * limitedCompare(*, *, -1)               = IllegalArgumentException
+     * limitedCompare("","", 0)               = 0
+     * limitedCompare("aaapppp", "", 8)       = 7
+     * limitedCompare("aaapppp", "", 7)       = 7
+     * limitedCompare("aaapppp", "", 6))      = -1
+     * limitedCompare("elephant", "hippo", 7) = 7
+     * limitedCompare("elephant", "hippo", 6) = -1
+     * limitedCompare("hippo", "elephant", 7) = 7
+     * limitedCompare("hippo", "elephant", 6) = -1
      * </pre>
      *
      * @param left the first string, must not be null
      * @param right the second string, must not be null
      * @param threshold the target threshold, must not be negative
-     * @return result distance
-     * @throws IllegalArgumentException if either String input {@code null} or
-     *             negative threshold
+     * @return result distance, or -1
      */
-    public Integer compare(CharSequence left, CharSequence right, int threshold) {
+    private static int limitedCompare(CharSequence left, CharSequence right, int threshold)
{
         if (left == null || right == null) {
             throw new IllegalArgumentException("Strings must not be null");
         }
@@ -124,7 +158,7 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
          * equal to the threshold value, returning -1 if it's greater. The
          * advantage is performance: unbounded distance is O(nm), but a bound of
          * k allows us to reduce it to O(km) time by only computing a diagonal
-         * stripe of width 2k + 1 of the cost table. It is also possible to use
+         * stripe of width 2k + 1 of the cost table. It is also possible to use* this to
compute the unbounded Levenshtein distance by starting the
          * this to compute the unbounded Levenshtein distance by starting the
          * threshold at 1 and doubling each time until the distance is found;
          * this is O(dm), where d is the distance.
@@ -143,8 +177,16 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
          * and our threshold is 1. In this case we're going to walk a stripe of
          * length 3. The matrix would look like so:
          *
-         * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 |
-         * | | |#|#| 6 | | | | |#| 7 | | | | | |
+         * <pre>
+         *    1 2 3 4 5
+         * 1 |#|#| | | |
+         * 2 |#|#|#| | |
+         * 3 | |#|#|#| |
+         * 4 | | |#|#|#|
+         * 5 | | | |#|#|
+         * 6 | | | | |#|
+         * 7 | | | | | |
+         * </pre>
          *
          * Note how the stripe leads off the table as there is no possible way
          * to turn a string of length 5 into one of length 7 in edit distance of
@@ -161,8 +203,8 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
          * some discussion.
          */
 
-        int n = left.length(); // length of s
-        int m = right.length(); // length of t
+        int n = left.length(); // length of left
+        int m = right.length(); // length of right
 
         // if one string is empty, the edit distance is necessarily the length
         // of the other
@@ -197,7 +239,7 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
 
         // iterates through t
         for (int j = 1; j <= m; j++) {
-            final char t_j = right.charAt(j - 1); // jth character of t
+            final char right_j = right.charAt(j - 1); // jth character of right
             d[0] = j;
 
             // compute stripe indices, constrain to array size
@@ -218,7 +260,7 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
 
             // iterates through [min, max] in s
             for (int i = min; i <= max; i++) {
-                if (left.charAt(i - 1) == t_j) {
+                if (left.charAt(i - 1) == right_j) {
                     // diagonally left and up
                     d[i] = p[i - 1];
                 } else {
@@ -243,4 +285,113 @@ public class LevenshteinDistance implements StringMetric<Integer>
{
         return -1;
     }
 
+    /**
+     * <p>Find the Levenshtein distance between two Strings.</p>
+     *
+     * <p>A higher score indicates a greater distance.</p>
+     *
+     * <p>The previous implementation of the Levenshtein distance algorithm
+     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     *
+     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+     * which can occur when my Java implementation is used with very large strings.<br>
+     * This implementation of the Levenshtein distance algorithm
+     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
+     *
+     * <pre>
+     * unlimitedCompare(null, *)             = IllegalArgumentException
+     * unlimitedCompare(*, null)             = IllegalArgumentException
+     * unlimitedCompare("","")               = 0
+     * unlimitedCompare("","a")              = 1
+     * unlimitedCompare("aaapppp", "")       = 7
+     * unlimitedCompare("frog", "fog")       = 1
+     * unlimitedCompare("fly", "ant")        = 3
+     * unlimitedCompare("elephant", "hippo") = 7
+     * unlimitedCompare("hippo", "elephant") = 7
+     * unlimitedCompare("hippo", "zzzzzzzz") = 8
+     * unlimitedCompare("hello", "hallo")    = 1
+     * </pre>
+     *
+     * @param left the first String, must not be null
+     * @param right the second String, must not be null
+     * @return result distance, or -1
+     * @throws IllegalArgumentException if either String input {@code null}
+     */
+    private static int unlimitedCompare(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        }
+
+        /*
+           The difference between this impl. and the previous is that, rather
+           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
+           we maintain two single-dimensional arrays of length s.length() + 1.  The first,
d,
+           is the 'current working' distance array that maintains the newest distance cost
+           counts as we iterate through the characters of String s.  Each time we increment
+           the index of String t we are comparing, d is copied to p, the second int[].  Doing
so
+           allows us to retain the previous cost counts as required by the algorithm (taking
+           the minimum of the cost count to the left, up one, and diagonally up and to the
left
+           of the current cost count being calculated).  (Note that the arrays aren't really
+           copied anymore, just switched...this is clearly much better than cloning an array
+           or doing a System.arraycopy() each time  through the outer loop.)
+
+           Effectively, the difference between the two implementations is this one does not
+           cause an out of memory condition when calculating the LD over two very large strings.
+         */
+
+        int n = left.length(); // length of left
+        int m = right.length(); // length of right
+
+        if (n == 0) {
+            return m;
+        } else if (m == 0) {
+            return n;
+        }
+
+        if (n > m) {
+            // swap the input strings to consume less memory
+            final CharSequence tmp = left;
+            left = right;
+            right = tmp;
+            n = m;
+            m = right.length();
+        }
+
+        int p[] = new int[n + 1]; //'previous' cost array, horizontally
+        int d[] = new int[n + 1]; // cost array, horizontally
+        int _d[]; //placeholder to assist in swapping p and d
+
+        // indexes into strings left and right
+        int i; // iterates through left
+        int j; // iterates through right
+
+        char right_j; // jth character of right
+
+        int cost; // cost
+
+        for (i = 0; i <= n; i++) {
+            p[i] = i;
+        }
+
+        for (j = 1; j <= m; j++) {
+            right_j = right.charAt(j - 1);
+            d[0] = j;
+
+            for (i = 1; i <= n; i++) {
+                cost = left.charAt(i - 1) == right_j ? 0 : 1;
+                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+            }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
+        }
+
+        // our last action in the above loop was to switch d and p, so p now
+        // actually has the most recent cost counts
+        return p[n];
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
index b2fab14..88778fc 100644
--- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java
@@ -20,56 +20,45 @@ import static org.junit.Assert.assertEquals;
 
 import java.util.Locale;
 
-import org.junit.BeforeClass;
 import org.junit.Test;
 
 /**
- * Unit tests for {@link org.apache.commons.text.FuzzyScore}.
+ * Unit tests for {@link org.apache.commons.text.similarity.FuzzyScore}.
  */
 public class FuzzyScoreTest {
 
-    private static FuzzyScore score;
-
-    @BeforeClass
-    public static void setUp() {
-        score = new FuzzyScore();
-    }
+    private static final FuzzyScore ENGLISH_SCORE = new FuzzyScore(Locale.ENGLISH);
 
     @Test
     public void testGetFuzzyScore() throws Exception {
-        assertEquals(0, (int) score.compare("", "", Locale.ENGLISH));
-        assertEquals(0,
-                (int) score.compare("Workshop", "b", Locale.ENGLISH));
-        assertEquals(1,
-                (int) score.compare("Room", "o", Locale.ENGLISH));
-        assertEquals(1,
-                (int) score.compare("Workshop", "w", Locale.ENGLISH));
-        assertEquals(2,
-                (int) score.compare("Workshop", "ws", Locale.ENGLISH));
-        assertEquals(4,
-                (int) score.compare("Workshop", "wo", Locale.ENGLISH));
-        assertEquals(3, (int) score.compare(
-                "Apache Software Foundation", "asf", Locale.ENGLISH));
+        assertEquals(0, (int) ENGLISH_SCORE.compare("", ""));
+        assertEquals(0, (int) ENGLISH_SCORE.compare("Workshop", "b"));
+        assertEquals(1, (int) ENGLISH_SCORE.compare("Room", "o"));
+        assertEquals(1, (int) ENGLISH_SCORE.compare("Workshop", "w"));
+        assertEquals(2, (int) ENGLISH_SCORE.compare("Workshop", "ws"));
+        assertEquals(4, (int) ENGLISH_SCORE.compare("Workshop", "wo"));
+        assertEquals(3, (int) ENGLISH_SCORE.compare(
+            "Apache Software Foundation", "asf"));
     }
 
     @Test(expected = IllegalArgumentException.class)
-    public void testGetFuzzyScore_NullNullNull() throws Exception {
-        score.compare(null, null, null);
+    public void testGetFuzzyScore_StringNullLocale() throws Exception {
+        ENGLISH_SCORE.compare("not null", null);
     }
 
     @Test(expected = IllegalArgumentException.class)
-    public void testGetFuzzyScore_StringNullLoclae() throws Exception {
-        score.compare(" ", null, Locale.ENGLISH);
+    public void testGetFuzzyScore_NullStringLocale() throws Exception {
+        ENGLISH_SCORE.compare(null, "not null");
     }
 
     @Test(expected = IllegalArgumentException.class)
-    public void testGetFuzzyScore_NullStringLocale() throws Exception {
-        score.compare(null, "clear", Locale.ENGLISH);
+    public void testGetFuzzyScore_NullNullLocale() throws Exception {
+        ENGLISH_SCORE.compare(null, null);
     }
 
     @Test(expected = IllegalArgumentException.class)
-    public void testGetFuzzyScore_StringStringNull() throws Exception {
-        score.compare(" ", "clear", null);
+    public void testMissingLocale() throws Exception {
+        FuzzyScore score = new FuzzyScore((Locale) null);
     }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
index 73e18f7..7050b05 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
@@ -22,7 +22,7 @@ import org.junit.BeforeClass;
 import org.junit.Test;
 
 /**
- * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}.
+ * Unit tests for {@link org.apache.commons.text.similarity.JaroWrinklerDistance}.
  */
 public class JaroWrinklerDistanceTest {
 

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index 1d4e6fa..814677d 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -18,129 +18,52 @@ package org.apache.commons.text.similarity;
 
 import static org.junit.Assert.assertEquals;
 
-import org.junit.BeforeClass;
 import org.junit.Test;
 
 /**
- * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}.
+ * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}.
  */
 public class LevenshteinDistanceTest {
 
-    private static LevenshteinDistance distance;
-
-    @BeforeClass
-    public static void setUp() {
-        distance = new LevenshteinDistance();
-    }
+    private static final LevenshteinDistance UNLIMITED_DISTANCE = new LevenshteinDistance();
 
     @Test
     public void testGetLevenshteinDistance_StringString() {
-        assertEquals(0, (int) distance.compare("", ""));
-        assertEquals(1, (int) distance.compare("", "a"));
-        assertEquals(7, (int) distance.compare("aaapppp", ""));
-        assertEquals(1, (int) distance.compare("frog", "fog"));
-        assertEquals(3, (int) distance.compare("fly", "ant"));
-        assertEquals(7, (int) distance.compare("elephant", "hippo"));
-        assertEquals(7, (int) distance.compare("hippo", "elephant"));
-        assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz"));
-        assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo"));
-        assertEquals(1, (int) distance.compare("hello", "hallo"));
+        assertEquals(0, (int) UNLIMITED_DISTANCE.compare("", ""));
+        assertEquals(1, (int) UNLIMITED_DISTANCE.compare("", "a"));
+        assertEquals(7, (int) UNLIMITED_DISTANCE.compare("aaapppp", ""));
+        assertEquals(1, (int) UNLIMITED_DISTANCE.compare("frog", "fog"));
+        assertEquals(3, (int) UNLIMITED_DISTANCE.compare("fly", "ant"));
+        assertEquals(7, (int) UNLIMITED_DISTANCE.compare("elephant", "hippo"));
+        assertEquals(7, (int) UNLIMITED_DISTANCE.compare("hippo", "elephant"));
+        assertEquals(8, (int) UNLIMITED_DISTANCE.compare("hippo", "zzzzzzzz"));
+        assertEquals(8, (int) UNLIMITED_DISTANCE.compare("zzzzzzzz", "hippo"));
+        assertEquals(1, (int) UNLIMITED_DISTANCE.compare("hello", "hallo"));
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testGetLevenshteinDistance_NullString() throws Exception {
-        distance.compare("a", null);
+        UNLIMITED_DISTANCE.compare("a", null);
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testGetLevenshteinDistance_StringNull() throws Exception {
-        distance.compare(null, "a");
-    }
-
-    @Test
-    public void testGetLevenshteinDistance_StringStringInt() {
-        // empty strings
-        assertEquals(0, (int) distance.compare("", "", 0));
-        assertEquals(7, (int) distance.compare("aaapppp", "", 8));
-        assertEquals(7, (int) distance.compare("aaapppp", "", 7));
-        assertEquals(-1, (int) distance.compare("aaapppp", "", 6));
-
-        // unequal strings, zero threshold
-        assertEquals(-1, (int) distance.compare("b", "a", 0));
-        assertEquals(-1, (int) distance.compare("a", "b", 0));
-
-        // equal strings
-        assertEquals(0, (int) distance.compare("aa", "aa", 0));
-        assertEquals(0, (int) distance.compare("aa", "aa", 2));
-
-        // same length
-        assertEquals(-1, (int) distance.compare("aaa", "bbb", 2));
-        assertEquals(3, (int) distance.compare("aaa", "bbb", 3));
-
-        // big stripe
-        assertEquals(6, (int) distance.compare("aaaaaa", "b", 10));
-
-        // distance less than threshold
-        assertEquals(7, (int) distance.compare("aaapppp", "b", 8));
-        assertEquals(3, (int) distance.compare("a", "bbb", 4));
-
-        // distance equal to threshold
-        assertEquals(7, (int) distance.compare("aaapppp", "b", 7));
-        assertEquals(3, (int) distance.compare("a", "bbb", 3));
-
-        // distance greater than threshold
-        assertEquals(-1, (int) distance.compare("a", "bbb", 2));
-        assertEquals(-1, (int) distance.compare("bbb", "a", 2));
-        assertEquals(-1, (int) distance.compare("aaapppp", "b", 6));
-
-        // stripe runs off array, strings not similar
-        assertEquals(-1, (int) distance.compare("a", "bbb", 1));
-        assertEquals(-1, (int) distance.compare("bbb", "a", 1));
-
-        // stripe runs off array, strings are similar
-        assertEquals(-1, (int) distance.compare("12345", "1234567", 1));
-        assertEquals(-1, (int) distance.compare("1234567", "12345", 1));
-
-        // old getLevenshteinDistance test cases
-        assertEquals(1, (int) distance.compare("frog", "fog", 1));
-        assertEquals(3, (int) distance.compare("fly", "ant", 3));
-        assertEquals(7, (int) distance.compare("elephant", "hippo", 7));
-        assertEquals(-1, (int) distance.compare("elephant", "hippo", 6));
-        assertEquals(7, (int) distance.compare("hippo", "elephant", 7));
-        assertEquals(-1, (int) distance.compare("hippo", "elephant", 6));
-        assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8));
-        assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8));
-        assertEquals(1, (int) distance.compare("hello", "hallo", 1));
-
-        assertEquals(1,
-                (int) distance.compare("frog", "fog", Integer.MAX_VALUE));
-        assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE));
-        assertEquals(7,
-                (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE));
-        assertEquals(7,
-                (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE));
-        assertEquals(8,
-                (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE));
-        assertEquals(8,
-                (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE));
-        assertEquals(1,
-                (int) distance.compare("hello", "hallo", Integer.MAX_VALUE));
+        UNLIMITED_DISTANCE.compare(null, "a");
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testGetLevenshteinDistance_NullStringInt() throws Exception {
-        distance.compare(null, "a", 0);
+        UNLIMITED_DISTANCE.compare(null, "a");
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testGetLevenshteinDistance_StringNullInt() throws Exception {
-        distance.compare("a", null, 0);
+        UNLIMITED_DISTANCE.compare("a", null);
     }
 
     @Test(expected = IllegalArgumentException.class)
-    public void testGetLevenshteinDistance_StringStringNegativeInt()
-            throws Exception {
-        distance.compare("a", "a", -1);
+    public void testConstructorWithNegativeThreshold() throws Exception {
+        LevenshteinDistance distance = new LevenshteinDistance(-1);
     }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
new file mode 100644
index 0000000..c6fd116
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}.
+ */
+@RunWith(Parameterized.class)
+public class ParameterizedLevenshteinDistanceTest {
+
+    private final Integer distance;
+    private final CharSequence left;
+    private final CharSequence right;
+    private final Integer threshold;
+
+    public ParameterizedLevenshteinDistanceTest(
+        final Integer threshold,
+        final CharSequence left, final CharSequence right,
+        final Integer distance) {
+
+        this.threshold = threshold;
+        this.left = left;
+        this.right = right;
+        this.distance = distance;
+    }
+
+    @Parameters
+    public static Iterable<Object[]> parameters() {
+        return Arrays.asList( new Object[][] {
+
+            /* empty strings */
+            { 0, "", "", 0 },
+            { 8, "aaapppp", "", 7 },
+            { 7, "aaapppp", "", 7 },
+            { 6, "aaapppp", "", -1 },
+
+            /* unequal strings, zero threshold */
+            { 0, "b", "a", -1 },
+            { 0, "a", "b", -1 },
+
+            /* equal strings */
+            { 0, "aa", "aa", 0 },
+            { 2, "aa", "aa", 0 },
+
+            /* same length */
+            { 2, "aaa", "bbb", -1 },
+            { 3, "aaa", "bbb", 3 },
+
+            /* big stripe */
+            { 10, "aaaaaa", "b", 6 },
+
+            /* distance less than threshold */
+            { 8, "aaapppp", "b", 7 },
+            { 4, "a", "bbb", 3 },
+
+            /* distance equal to threshold */
+            { 7, "aaapppp", "b", 7 },
+            { 3, "a", "bbb", 3 },
+
+            /* distance greater than threshold */
+            { 2, "a", "bbb", -1 },
+            { 2, "bbb", "a", -1 },
+            { 6, "aaapppp", "b", -1 },
+
+            /* stripe runs off array, strings not similar */
+            { 1, "a", "bbb", -1 },
+            { 1, "bbb", "a", -1 },
+
+            /* stripe runs off array, strings are similar */
+            { 1, "12345", "1234567", -1 },
+            { 1, "1234567", "12345", -1 },
+
+           /* old getLevenshteinDistance test cases */
+            { 1, "frog", "fog", 1 },
+            { 3, "fly", "ant", 3 },
+            { 7, "elephant", "hippo", 7 },
+            { 6, "elephant", "hippo", -1 },
+            { 7, "hippo", "elephant", 7 },
+            { 6, "hippo", "elephant", -1 },
+            { 8, "hippo", "zzzzzzzz", 8 },
+            { 8, "zzzzzzzz", "hippo", 8 },
+            { 1, "hello", "hallo", 1 },
+
+            { Integer.MAX_VALUE, "frog", "fog", 1 },
+            { Integer.MAX_VALUE, "fly", "ant", 3 },
+            { Integer.MAX_VALUE, "elephant", "hippo", 7 },
+            { Integer.MAX_VALUE, "hippo", "elephant", 7 },
+            { Integer.MAX_VALUE, "hippo", "zzzzzzzz", 8 },
+            { Integer.MAX_VALUE, "zzzzzzzz", "hippo", 8 },
+            { Integer.MAX_VALUE, "hello", "hallo", 1 }
+
+        } );
+    }
+
+    @Test
+    public void test() {
+        LevenshteinDistance metric = new LevenshteinDistance(threshold);
+        assertThat(metric.compare(left, right), equalTo(distance));
+    }
+
+}


Mime
View raw message