commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chtom...@apache.org
Subject [2/4] [text] TEXT-21: Making JaroWinkler a similarity score as opposed to an edit distance
Date Thu, 17 Nov 2016 12:14:13 GMT
TEXT-21: Making JaroWinkler a similarity score as opposed to an edit distance


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/7af39fb2
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/7af39fb2
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/7af39fb2

Branch: refs/heads/master
Commit: 7af39fb28d371bf1ca8aff63c5ddb71b9390e0de
Parents: c5f724c
Author: Rob Tompkins <chtompki@gmail.com>
Authored: Wed Nov 16 13:25:30 2016 -0500
Committer: Rob Tompkins <chtompki@gmail.com>
Committed: Wed Nov 16 13:25:30 2016 -0500

----------------------------------------------------------------------
 .../text/similarity/JaroWinklerDistance.java    | 367 +++++++++++++++++++
 .../text/similarity/JaroWrinklerDistance.java   | 367 -------------------
 .../text/similarity/SimilarityScore.java        |   1 +
 .../commons/text/similarity/package-info.java   |   2 +-
 .../similarity/JaroWinklerDistanceTest.java     |  62 ++++
 .../similarity/JaroWrinklerDistanceTest.java    |  62 ----
 .../ParameterizedEditDistanceFromTest.java      |   6 +-
 7 files changed, 434 insertions(+), 433 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
new file mode 100644
index 0000000..04f7ddf
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * A similarity algorithm indicating the percentage of matched characters between two character
sequences.
+ *
+ * <p>
+ * The Jaro measure is the weighted sum of percentage of matched characters
+ * from each file and transposed characters. Winkler increased this measure
+ * for matching initial characters.
+ * </p>
+ *
+ * <p>
+ * This implementation is based on the Jaro Winkler similarity algorithm
+ * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
+ * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ */
+public class JaroWinklerDistance implements SimilarityScore<Double> {
+
+    /**
+     * The default prefix length limit set to four.
+     */
+    private static final int PREFIX_LENGTH_LIMIT = 4;
+    /**
+     * Represents a failed index search.
+     */
+    public static final int INDEX_NOT_FOUND = -1;
+
+    /**
+     * Find the Jaro Winkler Distance which indicates the similarity score
+     * between two CharSequences.
+     *
+     * <pre>
+     * distance.apply(null, null)          = IllegalArgumentException
+     * distance.apply("","")               = 0.0
+     * distance.apply("","a")              = 0.0
+     * distance.apply("aaapppp", "")       = 0.0
+     * distance.apply("frog", "fog")       = 0.93
+     * distance.apply("fly", "ant")        = 0.0
+     * distance.apply("elephant", "hippo") = 0.44
+     * distance.apply("hippo", "elephant") = 0.44
+     * distance.apply("hippo", "zzzzzzzz") = 0.0
+     * distance.apply("hello", "hallo")    = 0.88
+     * distance.apply("ABC Corporation", "ABC Corp") = 0.91
+     * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93
+     * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") =
0.94
+     * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.9
+     * </pre>
+     *
+     * @param left the first String, must not be null
+     * @param right the second String, must not be null
+     * @return result distance
+     * @throws IllegalArgumentException if either String input {@code null}
+     */
+    @Override
+    public Double apply(CharSequence left, CharSequence right) {
+        final double defaultScalingFactor = 0.1;
+        final double percentageRoundValue = 100.0;
+
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        }
+
+        final double jaro = score(left, right);
+        final int cl = commonPrefixLength(left, right);
+        final double matchScore = Math.round((jaro + defaultScalingFactor
+                * cl * (1.0 - jaro)) * percentageRoundValue) / percentageRoundValue;
+
+        return matchScore;
+    }
+
+    /**
+     * Calculates the number of characters from the beginning of the strings
+     * that match exactly one-to-one, up to a maximum of four (4) characters.
+     *
+     * @param first The first string.
+     * @param second The second string.
+     * @return A number between 0 and 4.
+     */
+    private static int commonPrefixLength(final CharSequence first,
+            final CharSequence second) {
+        final int result = getCommonPrefix(first.toString(), second.toString())
+                .length();
+
+        // Limit the result to 4.
+        return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result;
+    }
+
+    /**
+     * Compares all Strings in an array and returns the initial sequence of
+     * characters that is common to all of them.
+     *
+     * <p>
+     * For example,
+     * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -&gt;
"i am a "</code>
+     * </p>
+     *
+     * <pre>
+     * getCommonPrefix(null) = ""
+     * getCommonPrefix(new String[] {}) = ""
+     * getCommonPrefix(new String[] {"abc"}) = "abc"
+     * getCommonPrefix(new String[] {null, null}) = ""
+     * getCommonPrefix(new String[] {"", ""}) = ""
+     * getCommonPrefix(new String[] {"", null}) = ""
+     * getCommonPrefix(new String[] {"abc", null, null}) = ""
+     * getCommonPrefix(new String[] {null, null, "abc"}) = ""
+     * getCommonPrefix(new String[] {"", "abc"}) = ""
+     * getCommonPrefix(new String[] {"abc", ""}) = ""
+     * getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
+     * getCommonPrefix(new String[] {"abc", "a"}) = "a"
+     * getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
+     * getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
+     * getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
+     * getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
+     * getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a "
+     * </pre>
+     *
+     * @param strs array of String objects, entries may be null
+     * @return the initial sequence of characters that are common to all Strings
+     *         in the array; empty String if the array is null, the elements are
+     *         all null or if there is no common prefix.
+     */
+    public static String getCommonPrefix(final String... strs) {
+        if (strs == null || strs.length == 0) {
+            return "";
+        }
+        final int smallestIndexOfDiff = indexOfDifference(strs);
+        if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
+            // all strings were identical
+            if (strs[0] == null) {
+                return "";
+            }
+            return strs[0];
+        } else if (smallestIndexOfDiff == 0) {
+            // there were no common initial characters
+            return "";
+        } else {
+            // we found a common initial character sequence
+            return strs[0].substring(0, smallestIndexOfDiff);
+        }
+    }
+
+    /**
+     * This method returns the Jaro-Winkler score for string matching.
+     *
+     * @param first the first string to be matched
+     * @param second the second string to be machted
+     * @return matching score without scaling factor impact
+     */
+    protected static double score(final CharSequence first,
+            final CharSequence second) {
+        String shorter;
+        String longer;
+
+        // Determine which String is longer.
+        if (first.length() > second.length()) {
+            longer = first.toString().toLowerCase();
+            shorter = second.toString().toLowerCase();
+        } else {
+            longer = second.toString().toLowerCase();
+            shorter = first.toString().toLowerCase();
+        }
+
+        // Calculate the half length() distance of the shorter String.
+        final int halflength = shorter.length() / 2 + 1;
+
+        // Find the set of matching characters between the shorter and longer
+        // strings. Note that
+        // the set of matching characters may be different depending on the
+        // order of the strings.
+        final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
+                halflength);
+        final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
+                halflength);
+
+        // If one or both of the sets of common characters is empty, then
+        // there is no similarity between the two strings.
+        if (m1.length() == 0 || m2.length() == 0) {
+            return 0.0;
+        }
+
+        // If the set of common characters is not the same size, then
+        // there is no similarity between the two strings, either.
+        if (m1.length() != m2.length()) {
+            return 0.0;
+        }
+
+        // Calculate the number of transposition between the two sets
+        // of common characters.
+        final int transpositions = transpositions(m1, m2);
+
+        final double defaultDenominator = 3.0;
+
+        // Calculate the distance.
+        final double dist = (m1.length() / ((double) shorter.length())
+                + m2.length() / ((double) longer.length()) + (m1.length() - transpositions)
+                / ((double) m1.length())) / defaultDenominator;
+        return dist;
+    }
+
+    /**
+     * Calculates the number of transposition between two strings.
+     *
+     * @param first The first string.
+     * @param second The second string.
+     * @return The number of transposition between the two strings.
+     */
+    protected static int transpositions(final CharSequence first,
+            final CharSequence second) {
+        int transpositions = 0;
+        for (int i = 0; i < first.length(); i++) {
+            if (first.charAt(i) != second.charAt(i)) {
+                transpositions++;
+            }
+        }
+        return transpositions / 2;
+    }
+
+    /**
+     * Compares all CharSequences in an array and returns the index at which the
+     * CharSequences begin to differ.
+     *
+     * <p>
+     * For example,
+     * <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -&gt;
7</code>
+     * </p>
+     *
+     * <pre>
+     * distance.indexOfDifference(null) = -1
+     * distance.indexOfDifference(new String[] {}) = -1
+     * distance.indexOfDifference(new String[] {"abc"}) = -1
+     * distance.indexOfDifference(new String[] {null, null}) = -1
+     * distance.indexOfDifference(new String[] {"", ""}) = -1
+     * distance.indexOfDifference(new String[] {"", null}) = 0
+     * distance.indexOfDifference(new String[] {"abc", null, null}) = 0
+     * distance.indexOfDifference(new String[] {null, null, "abc"}) = 0
+     * distance.indexOfDifference(new String[] {"", "abc"}) = 0
+     * distance.indexOfDifference(new String[] {"abc", ""}) = 0
+     * distance.indexOfDifference(new String[] {"abc", "abc"}) = -1
+     * distance.indexOfDifference(new String[] {"abc", "a"}) = 1
+     * distance.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
+     * distance.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
+     * distance.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
+     * distance.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
+     * distance.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7
+     * </pre>
+     *
+     * @param css array of CharSequences, entries may be null
+     * @return the index where the strings begin to differ; -1 if they are all
+     *         equal
+     */
+    protected static int indexOfDifference(final CharSequence... css) {
+        if (css == null || css.length <= 1) {
+            return INDEX_NOT_FOUND;
+        }
+        boolean anyStringNull = false;
+        boolean allStringsNull = true;
+        final int arrayLen = css.length;
+        int shortestStrLen = Integer.MAX_VALUE;
+        int longestStrLen = 0;
+
+        // find the min and max string lengths; this avoids checking to make
+        // sure we are not exceeding the length of the string each time through
+        // the bottom loop.
+        for (int i = 0; i < arrayLen; i++) {
+            if (css[i] == null) {
+                anyStringNull = true;
+                shortestStrLen = 0;
+            } else {
+                allStringsNull = false;
+                shortestStrLen = Math.min(css[i].length(), shortestStrLen);
+                longestStrLen = Math.max(css[i].length(), longestStrLen);
+            }
+        }
+
+        // handle lists containing all nulls or all empty strings
+        if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
+            return INDEX_NOT_FOUND;
+        }
+
+        // handle lists containing some nulls or some empty strings
+        if (shortestStrLen == 0) {
+            return 0;
+        }
+
+        // find the position with the first difference across all strings
+        int firstDiff = -1;
+        for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
+            final char comparisonChar = css[0].charAt(stringPos);
+            for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
+                if (css[arrayPos].charAt(stringPos) != comparisonChar) {
+                    firstDiff = stringPos;
+                    break;
+                }
+            }
+            if (firstDiff != -1) {
+                break;
+            }
+        }
+
+        if (firstDiff == -1 && shortestStrLen != longestStrLen) {
+            // we compared all of the characters up to the length of the
+            // shortest string and didn't find a match, but the string lengths
+            // vary, so return the length of the shortest string.
+            return shortestStrLen;
+        }
+        return firstDiff;
+    }
+
+    /**
+     * Gets a set of matching characters between two strings.
+     *
+     * <p>
+     * Two characters from the first string and the second string are
+     * considered matching if the character's respective positions are no
+     * farther than the limit value.
+     * </p>
+     *
+     * @param first The first string.
+     * @param second The second string.
+     * @param limit The maximum distance to consider.
+     * @return A string contain the set of common characters.
+     */
+    protected static String getSetOfMatchingCharacterWithin(
+            final CharSequence first, final CharSequence second, final int limit) {
+        final StringBuilder common = new StringBuilder();
+        final StringBuilder copy = new StringBuilder(second);
+
+        for (int i = 0; i < first.length(); i++) {
+            final char ch = first.charAt(i);
+            boolean found = false;
+
+            // See if the character is within the limit positions away from the
+            // original position of that character.
+            for (int j = Math.max(0, i - limit); !found
+                    && j < Math.min(i + limit, second.length()); j++) {
+                if (copy.charAt(j) == ch) {
+                    found = true;
+                    common.append(ch);
+                    copy.setCharAt(j, '*');
+                }
+            }
+        }
+        return common.toString();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
deleted file mode 100644
index 697e995..0000000
--- a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-/**
- * A similarity algorithm indicating the percentage of matched characters between two character
sequences.
- *
- * <p>
- * The Jaro measure is the weighted sum of percentage of matched characters
- * from each file and transposed characters. Winkler increased this measure
- * for matching initial characters.
- * </p>
- *
- * <p>
- * This implementation is based on the Jaro Winkler similarity algorithm
- * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
- * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
- * </p>
- *
- * <p>
- * This code has been adapted from Apache Commons Lang 3.3.
- * </p>
- */
-public class JaroWrinklerDistance implements EditDistance<Double> {
-
-    /**
-     * The default prefix length limit set to four.
-     */
-    private static final int PREFIX_LENGTH_LIMIT = 4;
-    /**
-     * Represents a failed index search.
-     */
-    public static final int INDEX_NOT_FOUND = -1;
-
-    /**
-     * Find the Jaro Winkler Distance which indicates the similarity score
-     * between two CharSequences.
-     *
-     * <pre>
-     * distance.apply(null, null)          = IllegalArgumentException
-     * distance.apply("","")               = 0.0
-     * distance.apply("","a")              = 0.0
-     * distance.apply("aaapppp", "")       = 0.0
-     * distance.apply("frog", "fog")       = 0.93
-     * distance.apply("fly", "ant")        = 0.0
-     * distance.apply("elephant", "hippo") = 0.44
-     * distance.apply("hippo", "elephant") = 0.44
-     * distance.apply("hippo", "zzzzzzzz") = 0.0
-     * distance.apply("hello", "hallo")    = 0.88
-     * distance.apply("ABC Corporation", "ABC Corp") = 0.91
-     * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93
-     * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") =
0.94
-     * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.9
-     * </pre>
-     *
-     * @param left the first String, must not be null
-     * @param right the second String, must not be null
-     * @return result distance
-     * @throws IllegalArgumentException if either String input {@code null}
-     */
-    @Override
-    public Double apply(CharSequence left, CharSequence right) {
-        final double defaultScalingFactor = 0.1;
-        final double percentageRoundValue = 100.0;
-
-        if (left == null || right == null) {
-            throw new IllegalArgumentException("Strings must not be null");
-        }
-
-        final double jaro = score(left, right);
-        final int cl = commonPrefixLength(left, right);
-        final double matchScore = Math.round((jaro + defaultScalingFactor
-                * cl * (1.0 - jaro)) * percentageRoundValue) / percentageRoundValue;
-
-        return matchScore;
-    }
-
-    /**
-     * Calculates the number of characters from the beginning of the strings
-     * that match exactly one-to-one, up to a maximum of four (4) characters.
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @return A number between 0 and 4.
-     */
-    private static int commonPrefixLength(final CharSequence first,
-            final CharSequence second) {
-        final int result = getCommonPrefix(first.toString(), second.toString())
-                .length();
-
-        // Limit the result to 4.
-        return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result;
-    }
-
-    /**
-     * Compares all Strings in an array and returns the initial sequence of
-     * characters that is common to all of them.
-     *
-     * <p>
-     * For example,
-     * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -&gt;
"i am a "</code>
-     * </p>
-     *
-     * <pre>
-     * getCommonPrefix(null) = ""
-     * getCommonPrefix(new String[] {}) = ""
-     * getCommonPrefix(new String[] {"abc"}) = "abc"
-     * getCommonPrefix(new String[] {null, null}) = ""
-     * getCommonPrefix(new String[] {"", ""}) = ""
-     * getCommonPrefix(new String[] {"", null}) = ""
-     * getCommonPrefix(new String[] {"abc", null, null}) = ""
-     * getCommonPrefix(new String[] {null, null, "abc"}) = ""
-     * getCommonPrefix(new String[] {"", "abc"}) = ""
-     * getCommonPrefix(new String[] {"abc", ""}) = ""
-     * getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
-     * getCommonPrefix(new String[] {"abc", "a"}) = "a"
-     * getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
-     * getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
-     * getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
-     * getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
-     * getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a "
-     * </pre>
-     *
-     * @param strs array of String objects, entries may be null
-     * @return the initial sequence of characters that are common to all Strings
-     *         in the array; empty String if the array is null, the elements are
-     *         all null or if there is no common prefix.
-     */
-    public static String getCommonPrefix(final String... strs) {
-        if (strs == null || strs.length == 0) {
-            return "";
-        }
-        final int smallestIndexOfDiff = indexOfDifference(strs);
-        if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
-            // all strings were identical
-            if (strs[0] == null) {
-                return "";
-            }
-            return strs[0];
-        } else if (smallestIndexOfDiff == 0) {
-            // there were no common initial characters
-            return "";
-        } else {
-            // we found a common initial character sequence
-            return strs[0].substring(0, smallestIndexOfDiff);
-        }
-    }
-
-    /**
-     * This method returns the Jaro-Winkler score for string matching.
-     *
-     * @param first the first string to be matched
-     * @param second the second string to be machted
-     * @return matching score without scaling factor impact
-     */
-    protected static double score(final CharSequence first,
-            final CharSequence second) {
-        String shorter;
-        String longer;
-
-        // Determine which String is longer.
-        if (first.length() > second.length()) {
-            longer = first.toString().toLowerCase();
-            shorter = second.toString().toLowerCase();
-        } else {
-            longer = second.toString().toLowerCase();
-            shorter = first.toString().toLowerCase();
-        }
-
-        // Calculate the half length() distance of the shorter String.
-        final int halflength = shorter.length() / 2 + 1;
-
-        // Find the set of matching characters between the shorter and longer
-        // strings. Note that
-        // the set of matching characters may be different depending on the
-        // order of the strings.
-        final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
-                halflength);
-        final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
-                halflength);
-
-        // If one or both of the sets of common characters is empty, then
-        // there is no similarity between the two strings.
-        if (m1.length() == 0 || m2.length() == 0) {
-            return 0.0;
-        }
-
-        // If the set of common characters is not the same size, then
-        // there is no similarity between the two strings, either.
-        if (m1.length() != m2.length()) {
-            return 0.0;
-        }
-
-        // Calculate the number of transposition between the two sets
-        // of common characters.
-        final int transpositions = transpositions(m1, m2);
-
-        final double defaultDenominator = 3.0;
-
-        // Calculate the distance.
-        final double dist = (m1.length() / ((double) shorter.length())
-                + m2.length() / ((double) longer.length()) + (m1.length() - transpositions)
-                / ((double) m1.length())) / defaultDenominator;
-        return dist;
-    }
-
-    /**
-     * Calculates the number of transposition between two strings.
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @return The number of transposition between the two strings.
-     */
-    protected static int transpositions(final CharSequence first,
-            final CharSequence second) {
-        int transpositions = 0;
-        for (int i = 0; i < first.length(); i++) {
-            if (first.charAt(i) != second.charAt(i)) {
-                transpositions++;
-            }
-        }
-        return transpositions / 2;
-    }
-
-    /**
-     * Compares all CharSequences in an array and returns the index at which the
-     * CharSequences begin to differ.
-     *
-     * <p>
-     * For example,
-     * <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -&gt;
7</code>
-     * </p>
-     *
-     * <pre>
-     * distance.indexOfDifference(null) = -1
-     * distance.indexOfDifference(new String[] {}) = -1
-     * distance.indexOfDifference(new String[] {"abc"}) = -1
-     * distance.indexOfDifference(new String[] {null, null}) = -1
-     * distance.indexOfDifference(new String[] {"", ""}) = -1
-     * distance.indexOfDifference(new String[] {"", null}) = 0
-     * distance.indexOfDifference(new String[] {"abc", null, null}) = 0
-     * distance.indexOfDifference(new String[] {null, null, "abc"}) = 0
-     * distance.indexOfDifference(new String[] {"", "abc"}) = 0
-     * distance.indexOfDifference(new String[] {"abc", ""}) = 0
-     * distance.indexOfDifference(new String[] {"abc", "abc"}) = -1
-     * distance.indexOfDifference(new String[] {"abc", "a"}) = 1
-     * distance.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
-     * distance.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
-     * distance.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
-     * distance.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
-     * distance.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7
-     * </pre>
-     *
-     * @param css array of CharSequences, entries may be null
-     * @return the index where the strings begin to differ; -1 if they are all
-     *         equal
-     */
-    protected static int indexOfDifference(final CharSequence... css) {
-        if (css == null || css.length <= 1) {
-            return INDEX_NOT_FOUND;
-        }
-        boolean anyStringNull = false;
-        boolean allStringsNull = true;
-        final int arrayLen = css.length;
-        int shortestStrLen = Integer.MAX_VALUE;
-        int longestStrLen = 0;
-
-        // find the min and max string lengths; this avoids checking to make
-        // sure we are not exceeding the length of the string each time through
-        // the bottom loop.
-        for (int i = 0; i < arrayLen; i++) {
-            if (css[i] == null) {
-                anyStringNull = true;
-                shortestStrLen = 0;
-            } else {
-                allStringsNull = false;
-                shortestStrLen = Math.min(css[i].length(), shortestStrLen);
-                longestStrLen = Math.max(css[i].length(), longestStrLen);
-            }
-        }
-
-        // handle lists containing all nulls or all empty strings
-        if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
-            return INDEX_NOT_FOUND;
-        }
-
-        // handle lists containing some nulls or some empty strings
-        if (shortestStrLen == 0) {
-            return 0;
-        }
-
-        // find the position with the first difference across all strings
-        int firstDiff = -1;
-        for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
-            final char comparisonChar = css[0].charAt(stringPos);
-            for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
-                if (css[arrayPos].charAt(stringPos) != comparisonChar) {
-                    firstDiff = stringPos;
-                    break;
-                }
-            }
-            if (firstDiff != -1) {
-                break;
-            }
-        }
-
-        if (firstDiff == -1 && shortestStrLen != longestStrLen) {
-            // we compared all of the characters up to the length of the
-            // shortest string and didn't find a match, but the string lengths
-            // vary, so return the length of the shortest string.
-            return shortestStrLen;
-        }
-        return firstDiff;
-    }
-
-    /**
-     * Gets a set of matching characters between two strings.
-     *
-     * <p>
-     * Two characters from the first string and the second string are
-     * considered matching if the character's respective positions are no
-     * farther than the limit value.
-     * </p>
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @param limit The maximum distance to consider.
-     * @return A string contain the set of common characters.
-     */
-    protected static String getSetOfMatchingCharacterWithin(
-            final CharSequence first, final CharSequence second, final int limit) {
-        final StringBuilder common = new StringBuilder();
-        final StringBuilder copy = new StringBuilder(second);
-
-        for (int i = 0; i < first.length(); i++) {
-            final char ch = first.charAt(i);
-            boolean found = false;
-
-            // See if the character is within the limit positions away from the
-            // original position of that character.
-            for (int j = Math.max(0, i - limit); !found
-                    && j < Math.min(i + limit, second.length()); j++) {
-                if (copy.charAt(j) == ch) {
-                    found = true;
-                    common.append(ch);
-                    copy.setCharAt(j, '*');
-                }
-            }
-        }
-        return common.toString();
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java b/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java
index c19645c..a1f8dff 100644
--- a/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java
+++ b/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java
@@ -15,6 +15,7 @@ package org.apache.commons.text.similarity;
  *     <li><code>d(x,y) &gt;= 0</code>, non-negativity or separation
axiom</li>
  *     <li><code>d(x,y) == d(y,x)</code>, symmetry.</li>
  * </ul>
+ *
  * <p>
  * Notice, these are two of the properties that contribute to d being a metric.
  * </p>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/main/java/org/apache/commons/text/similarity/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java
index d2a19a6..6512f38 100644
--- a/src/main/java/org/apache/commons/text/similarity/package-info.java
+++ b/src/main/java/org/apache/commons/text/similarity/package-info.java
@@ -28,7 +28,7 @@
  * <li>{@link org.apache.commons.text.similarity.CosineSimilarity Cosine Similarity}</li>
  * <li>{@link org.apache.commons.text.similarity.FuzzyScore Fuzzy Score}</li>
  * <li>{@link org.apache.commons.text.similarity.HammingDistance Hamming Distance}</li>
- * <li>{@link org.apache.commons.text.similarity.JaroWrinklerDistance Jaro-Wrinkler
Distance}</li>
+ * <li>{@link org.apache.commons.text.similarity.JaroWinklerDistance Jaro-Wrinkler
Distance}</li>
  * <li>{@link org.apache.commons.text.similarity.LevenshteinDistance Levenshtein Distance}</li>
  * </ul>
  *

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
new file mode 100644
index 0000000..7548e6e
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link JaroWinklerDistance}.
+ */
+public class JaroWinklerDistanceTest {
+
+    private static JaroWinklerDistance distance;
+    
+    @BeforeClass
+    public static void setUp() {
+        distance = new JaroWinklerDistance();
+    }
+    
+    @Test
+    public void testGetJaroWinklerDistance_StringString() {
+        assertEquals(0.93d, (double) distance.apply("frog", "fog"), 0.0d);
+        assertEquals(0.0d, (double) distance.apply("fly", "ant"), 0.0d);
+        assertEquals(0.44d, (double) distance.apply("elephant", "hippo"), 0.0d);
+        assertEquals(0.91d, (double) distance.apply("ABC Corporation", "ABC Corp"), 0.0d);
+        assertEquals(0.93d, (double) distance.apply("D N H Enterprises Inc", "D & H Enterprises,
Inc."), 0.0d);
+        assertEquals(0.94d, (double) distance.apply("My Gym Children's Fitness Center", "My
Gym. Childrens Fitness"), 0.0d);
+        assertEquals(0.9d, (double) distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetJaroWinklerDistance_NullNull() throws Exception {
+        distance.apply(null, null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetJaroWinklerDistance_StringNull() throws Exception {
+        distance.apply(" ", null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetJaroWinklerDistance_NullString() throws Exception {
+        distance.apply(null, "clear");
+    }
+    
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
deleted file mode 100644
index 660cca5..0000000
--- a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-/**
- * Unit tests for {@link org.apache.commons.text.similarity.JaroWrinklerDistance}.
- */
-public class JaroWrinklerDistanceTest {
-
-    private static JaroWrinklerDistance distance;
-    
-    @BeforeClass
-    public static void setUp() {
-        distance = new JaroWrinklerDistance();
-    }
-    
-    @Test
-    public void testGetJaroWinklerDistance_StringString() {
-        assertEquals(0.93d, (double) distance.apply("frog", "fog"), 0.0d);
-        assertEquals(0.0d, (double) distance.apply("fly", "ant"), 0.0d);
-        assertEquals(0.44d, (double) distance.apply("elephant", "hippo"), 0.0d);
-        assertEquals(0.91d, (double) distance.apply("ABC Corporation", "ABC Corp"), 0.0d);
-        assertEquals(0.93d, (double) distance.apply("D N H Enterprises Inc", "D & H Enterprises,
Inc."), 0.0d);
-        assertEquals(0.94d, (double) distance.apply("My Gym Children's Fitness Center", "My
Gym. Childrens Fitness"), 0.0d);
-        assertEquals(0.9d, (double) distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
-    }
-
-    @Test(expected = IllegalArgumentException.class)
-    public void testGetJaroWinklerDistance_NullNull() throws Exception {
-        distance.apply(null, null);
-    }
-
-    @Test(expected = IllegalArgumentException.class)
-    public void testGetJaroWinklerDistance_StringNull() throws Exception {
-        distance.apply(" ", null);
-    }
-
-    @Test(expected = IllegalArgumentException.class)
-    public void testGetJaroWinklerDistance_NullString() throws Exception {
-        distance.apply(null, "clear");
-    }
-    
-}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7af39fb2/src/test/java/org/apache/commons/text/similarity/ParameterizedEditDistanceFromTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedEditDistanceFromTest.java
b/src/test/java/org/apache/commons/text/similarity/ParameterizedEditDistanceFromTest.java
index f95ca1a..1068749 100644
--- a/src/test/java/org/apache/commons/text/similarity/ParameterizedEditDistanceFromTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedEditDistanceFromTest.java
@@ -58,9 +58,9 @@ public class ParameterizedEditDistanceFromTest<R> {
             { new HammingDistance(), "Japtheth, Ham, Shem", "Japtheth, HAM, Shem", 2 },
             { new HammingDistance(), "Hamming", "Hamming", 0 },
 
-            { new JaroWrinklerDistance(), "elephant", "hippo", 0.44 },
-            { new JaroWrinklerDistance(), "hippo", "elephant",  0.44 },
-            { new JaroWrinklerDistance(), "hippo", "zzzzzzzz", 0.0 },
+            { new JaroWinklerDistance(), "elephant", "hippo", 0.44 },
+            { new JaroWinklerDistance(), "hippo", "elephant",  0.44 },
+            { new JaroWinklerDistance(), "hippo", "zzzzzzzz", 0.0 },
 
             { new LevenshteinDistance(), "Apache", "a patchy", 4 },
             { new LevenshteinDistance(), "go", "no go", 3 },


Mime
View raw message