Return-Path: X-Original-To: apmail-commons-notifications-archive@minotaur.apache.org Delivered-To: apmail-commons-notifications-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id CF9361014E for ; Fri, 17 Apr 2015 07:26:54 +0000 (UTC) Received: (qmail 86666 invoked by uid 500); 17 Apr 2015 07:11:50 -0000 Delivered-To: apmail-commons-notifications-archive@commons.apache.org Received: (qmail 73969 invoked by uid 500); 17 Apr 2015 07:11:43 -0000 Mailing-List: contact notifications-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list notifications@commons.apache.org Received: (qmail 59495 invoked by uid 99); 17 Apr 2015 06:46:37 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 17 Apr 2015 06:46:37 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id 280AFAC0AE6 for ; Fri, 17 Apr 2015 06:46:37 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r947984 [25/36] - in /websites/production/commons/content/sandbox/commons-text: ./ apidocs/ apidocs/org/apache/commons/text/diff/ apidocs/org/apache/commons/text/diff/class-use/ apidocs/org/apache/commons/text/names/ apidocs/org/apache/comm... Date: Fri, 17 Apr 2015 06:46:32 -0000 To: notifications@commons.apache.org From: kinow@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150417064637.280AFAC0AE6@hades.apache.org> Modified: websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWrinklerDistance.java.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWrinklerDistance.java.html (original) +++ websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWrinklerDistance.java.html Fri Apr 17 06:46:28 2015 @@ -34,10 +34,16 @@ package org.apache.commons.text.similari * <p> * This code has been adapted from Apache Commons Lang 3.3. * </p> + * + * @since 1.0 */ -public class JaroWrinklerDistance implements StringMetric<Double> { +public class JaroWrinklerDistance implements EditDistance<Double> { /** + * The default prefix length limit set to four. + */ + private static final int PREFIX_LENGTH_LIMIT = 4; + /** * Represents a failed index search. */ public static final int INDEX_NOT_FOUND = -1; @@ -47,20 +53,20 @@ package org.apache.commons.text.similari * between two CharSequences. * * <pre> - * distance.compare(null, null) = IllegalArgumentException - * distance.compare("","") = 0.0 - * distance.compare("","a") = 0.0 - * distance.compare("aaapppp", "") = 0.0 - * distance.compare("frog", "fog") = 0.93 - * distance.compare("fly", "ant") = 0.0 - * distance.compare("elephant", "hippo") = 0.44 - * distance.compare("hippo", "elephant") = 0.44 - * distance.compare("hippo", "zzzzzzzz") = 0.0 - * distance.compare("hello", "hallo") = 0.88 - * distance.compare("ABC Corporation", "ABC Corp") = 0.91 - * distance.compare("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93 - * distance.compare("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 - * distance.compare("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 + * distance.apply(null, null) = IllegalArgumentException + * distance.apply("","") = 0.0 + * distance.apply("","a") = 0.0 + * distance.apply("aaapppp", "") = 0.0 + * distance.apply("frog", "fog") = 0.93 + * distance.apply("fly", "ant") = 0.0 + * distance.apply("elephant", "hippo") = 0.44 + * distance.apply("hippo", "elephant") = 0.44 + * distance.apply("hippo", "zzzzzzzz") = 0.0 + * distance.apply("hello", "hallo") = 0.88 + * distance.apply("ABC Corporation", "ABC Corp") = 0.91 + * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93 + * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 + * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 * </pre> * * @param left the first String, must not be null @@ -69,19 +75,20 @@ package org.apache.commons.text.similari * @throws IllegalArgumentException if either String input {@code null} */ @Override - public Double compare(CharSequence left, CharSequence right) { - final double DEFAULT_SCALING_FACTOR = 0.1; + public Double apply(CharSequence left, CharSequence right) { + final double defaultScalingFactor = 0.1; + final double percentageRoundValue = 100.0; - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); } - final double jaro = score(left, right); - final int cl = commonPrefixLength(left, right); - final double matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR - * cl * (1.0 - jaro))) *100.0)/100.0; + final double jaro = score(left, right); + final int cl = commonPrefixLength(left, right); + final double matchScore = Math.round((jaro + defaultScalingFactor + * cl * (1.0 - jaro)) * percentageRoundValue) / percentageRoundValue; - return matchScore; + return matchScore; } /** @@ -94,11 +101,11 @@ package org.apache.commons.text.similari */ private static int commonPrefixLength(final CharSequence first, final CharSequence second) { - final int result = getCommonPrefix(first.toString(), second.toString()) - .length(); + final int result = getCommonPrefix(first.toString(), second.toString()) + .length(); // Limit the result to 4. - return result > 4 ? 4 : result; + return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result; } /** @@ -136,22 +143,22 @@ package org.apache.commons.text.similari * all null or if there is no common prefix. */ public static String getCommonPrefix(final String... strs) { - if (strs == null || strs.length == 0) { - return ""; + if (strs == null || strs.length == 0) { + return ""; } - final int smallestIndexOfDiff = indexOfDifference(strs); - if (smallestIndexOfDiff == INDEX_NOT_FOUND) { + final int smallestIndexOfDiff = indexOfDifference(strs); + if (smallestIndexOfDiff == INDEX_NOT_FOUND) { // all strings were identical - if (strs[0] == null) { - return ""; + if (strs[0] == null) { + return ""; } - return strs[0]; - } else if (smallestIndexOfDiff == 0) { + return strs[0]; + } else if (smallestIndexOfDiff == 0) { // there were no common initial characters - return ""; + return ""; } else { // we found a common initial character sequence - return strs[0].substring(0, smallestIndexOfDiff); + return strs[0].substring(0, smallestIndexOfDiff); } } @@ -168,47 +175,49 @@ package org.apache.commons.text.similari String longer; // Determine which String is longer. - if (first.length() > second.length()) { - longer = first.toString().toLowerCase(); - shorter = second.toString().toLowerCase(); + if (first.length() > second.length()) { + longer = first.toString().toLowerCase(); + shorter = second.toString().toLowerCase(); } else { - longer = second.toString().toLowerCase(); - shorter = first.toString().toLowerCase(); + longer = second.toString().toLowerCase(); + shorter = first.toString().toLowerCase(); } // Calculate the half length() distance of the shorter String. - final int halflength = shorter.length() / 2 + 1; + final int halflength = shorter.length() / 2 + 1; // Find the set of matching characters between the shorter and longer // strings. Note that // the set of matching characters may be different depending on the // order of the strings. - final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, + final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); - final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, + final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); // If one or both of the sets of common characters is empty, then // there is no similarity between the two strings. - if (m1.length() == 0 || m2.length() == 0) { - return 0.0; + if (m1.length() == 0 || m2.length() == 0) { + return 0.0; } // If the set of common characters is not the same size, then // there is no similarity between the two strings, either. - if (m1.length() != m2.length()) { - return 0.0; + if (m1.length() != m2.length()) { + return 0.0; } // Calculate the number of transposition between the two sets // of common characters. - final int transpositions = transpositions(m1, m2); + final int transpositions = transpositions(m1, m2); + + final double defaultDenominator = 3.0; // Calculate the distance. - final double dist = (m1.length() / ((double) shorter.length()) - + m2.length() / ((double) longer.length()) + (m1.length() - transpositions) - / ((double) m1.length())) / 3.0; - return dist; + final double dist = (m1.length() / ((double) shorter.length()) + + m2.length() / ((double) longer.length()) + (m1.length() - transpositions) + / ((double) m1.length())) / defaultDenominator; + return dist; } /** @@ -220,13 +229,13 @@ package org.apache.commons.text.similari */ protected static int transpositions(final CharSequence first, final CharSequence second) { - int transpositions = 0; - for (int i = 0; i < first.length(); i++) { - if (first.charAt(i) != second.charAt(i)) { - transpositions++; + int transpositions = 0; + for (int i = 0; i < first.length(); i++) { + if (first.charAt(i) != second.charAt(i)) { + transpositions++; } } - return transpositions / 2; + return transpositions / 2; } /** @@ -263,61 +272,61 @@ package org.apache.commons.text.similari * equal */ protected static int indexOfDifference(final CharSequence... css) { - if (css == null || css.length <= 1) { - return INDEX_NOT_FOUND; + if (css == null || css.length <= 1) { + return INDEX_NOT_FOUND; } - boolean anyStringNull = false; - boolean allStringsNull = true; - final int arrayLen = css.length; - int shortestStrLen = Integer.MAX_VALUE; - int longestStrLen = 0; + boolean anyStringNull = false; + boolean allStringsNull = true; + final int arrayLen = css.length; + int shortestStrLen = Integer.MAX_VALUE; + int longestStrLen = 0; // find the min and max string lengths; this avoids checking to make // sure we are not exceeding the length of the string each time through // the bottom loop. - for (int i = 0; i < arrayLen; i++) { - if (css[i] == null) { - anyStringNull = true; - shortestStrLen = 0; + for (int i = 0; i < arrayLen; i++) { + if (css[i] == null) { + anyStringNull = true; + shortestStrLen = 0; } else { - allStringsNull = false; - shortestStrLen = Math.min(css[i].length(), shortestStrLen); - longestStrLen = Math.max(css[i].length(), longestStrLen); + allStringsNull = false; + shortestStrLen = Math.min(css[i].length(), shortestStrLen); + longestStrLen = Math.max(css[i].length(), longestStrLen); } } // handle lists containing all nulls or all empty strings - if (allStringsNull || longestStrLen == 0 && !anyStringNull) { - return INDEX_NOT_FOUND; + if (allStringsNull || longestStrLen == 0 && !anyStringNull) { + return INDEX_NOT_FOUND; } // handle lists containing some nulls or some empty strings - if (shortestStrLen == 0) { - return 0; + if (shortestStrLen == 0) { + return 0; } // find the position with the first difference across all strings - int firstDiff = -1; - for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) { - final char comparisonChar = css[0].charAt(stringPos); - for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) { - if (css[arrayPos].charAt(stringPos) != comparisonChar) { - firstDiff = stringPos; - break; + int firstDiff = -1; + for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) { + final char comparisonChar = css[0].charAt(stringPos); + for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) { + if (css[arrayPos].charAt(stringPos) != comparisonChar) { + firstDiff = stringPos; + break; } } - if (firstDiff != -1) { - break; + if (firstDiff != -1) { + break; } } - if (firstDiff == -1 && shortestStrLen != longestStrLen) { + if (firstDiff == -1 && shortestStrLen != longestStrLen) { // we compared all of the characters up to the length of the // shortest string and didn't find a match, but the string lengths // vary, so return the length of the shortest string. - return shortestStrLen; + return shortestStrLen; } - return firstDiff; + return firstDiff; } /** @@ -336,25 +345,25 @@ package org.apache.commons.text.similari */ protected static String getSetOfMatchingCharacterWithin( final CharSequence first, final CharSequence second, final int limit) { - final StringBuilder common = new StringBuilder(); - final StringBuilder copy = new StringBuilder(second); + final StringBuilder common = new StringBuilder(); + final StringBuilder copy = new StringBuilder(second); - for (int i = 0; i < first.length(); i++) { - final char ch = first.charAt(i); - boolean found = false; + for (int i = 0; i < first.length(); i++) { + final char ch = first.charAt(i); + boolean found = false; // See if the character is within the limit positions away from the // original position of that character. - for (int j = Math.max(0, i - limit); !found - && j < Math.min(i + limit, second.length()); j++) { - if (copy.charAt(j) == ch) { - found = true; - common.append(ch); - copy.setCharAt(j, '*'); + for (int j = Math.max(0, i - limit); !found + && j < Math.min(i + limit, second.length()); j++) { + if (copy.charAt(j) == ch) { + found = true; + common.append(ch); + copy.setCharAt(j, '*'); } } } - return common.toString(); + return common.toString(); } } Modified: websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.html (original) +++ websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.html Fri Apr 17 06:46:28 2015 @@ -1 +1 @@ -LevenshteinDistance

LevenshteinDistance

ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethods
Total1 of 214100%1 of 3297%11904403
compare(CharSequence, CharSequence, int)204100%13197%11704201
compare(CharSequence, CharSequence)6100%n/a010101
LevenshteinDistance()3100%n/a010101
\ No newline at end of file +LevenshteinDistance

LevenshteinDistance

ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethods
Total16 of 37396%4 of 5693%63648328
limitedCompare(CharSequence, CharSequence, int)1118994%42888%41724201
getThreshold()30%n/a111111
getDefaultInstance()20%n/a111111
unlimitedCompare(CharSequence, CharSeque nce)127100%18100%01002801
LevenshteinDistance(Integer)16100%4100%030501
apply(CharSequence, CharSequence)16100%2100%020301
static {...}5100%n/a010101
LevenshteinDistance()4100%n/a010201
\ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.java.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.java.html (original) +++ websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/LevenshteinDistance.java.html Fri Apr 17 06:46:28 2015 @@ -30,52 +30,106 @@ import java.util.Arrays; * <p> * This code has been adapted from Apache Commons Lang 3.3. * </p> + * + * @since 1.0 */ -public class LevenshteinDistance implements StringMetric<Integer> { +public class LevenshteinDistance implements EditDistance<Integer> { + + /** + * Default instance. + */ + private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance(); + + /** + * Threshold. + */ + private final Integer threshold; /** - * Find the Levenshtein distance between two Strings. - * - * <p>A higher score indicates a greater distance.</p> - * * <p> - * The previous implementation of the Levenshtein distance algorithm was - * from <a - * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com - * /ld.htm</a> + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. * </p> * + * @see LevenshteinDistance#getDefaultInstance() + */ + public LevenshteinDistance() { + this(null); + } + + /** * <p> - * Chas Emerick has written an implementation in Java, which avoids an - * OutOfMemoryError which can occur when my Java implementation is used with - * very large strings.<br> - * This implementation of the Levenshtein distance algorithm is from <a - * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ - * ldjava.htm</a> + * If the threshold is not null, distance calculations will be limited to a maximum length. + * If the threshold is null, the unlimited version of the algorithm will be used. * </p> * + * @param threshold + * If this is null then distances calculations will not be limited. + * This may not be negative. + */ + public LevenshteinDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * * <pre> - * distance.compare(null, *) = IllegalArgumentException - * distance.compare(*, null) = IllegalArgumentException - * distance.compare("","") = 0 - * distance.compare("","a") = 1 - * distance.compare("aaapppp", "") = 7 - * distance.compare("frog", "fog") = 1 - * distance.compare("fly", "ant") = 3 - * distance.compare("elephant", "hippo") = 7 - * distance.compare("hippo", "elephant") = 7 - * distance.compare("hippo", "zzzzzzzz") = 8 - * distance.compare("hello", "hallo") = 1 + * distance.apply(null, *) = IllegalArgumentException + * distance.apply(*, null) = IllegalArgumentException + * distance.apply("","") = 0 + * distance.apply("","a") = 1 + * distance.apply("aaapppp", "") = 7 + * distance.apply("frog", "fog") = 1 + * distance.apply("fly", "ant") = 3 + * distance.apply("elephant", "hippo") = 7 + * distance.apply("hippo", "elephant") = 7 + * distance.apply("hippo", "zzzzzzzz") = 8 + * distance.apply("hello", "hallo") = 1 * </pre> * * @param left the first string, must not be null * @param right the second string, must not be null - * @return result distance + * @return result distance, or -1 * @throws IllegalArgumentException if either String input {@code null} */ - @Override - public Integer compare(CharSequence left, CharSequence right) { - return compare(left, right, Integer.MAX_VALUE); + public Integer apply(CharSequence left, CharSequence right) { + if (threshold != null) { + return limitedCompare(left, right, threshold); + } else { + return unlimitedCompare(left, right); + } + } + + /** + * Gets the default instance. + * + * @return the default instace + */ + public static LevenshteinDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + /** + * Gets the distance threshold. + * + * @return the distance threshold + */ + public Integer getThreshold() { + return threshold; } /** @@ -91,32 +145,30 @@ import java.util.Arrays; * </p> * * <pre> - * distance.compare(null, *, *) = IllegalArgumentException - * distance.compare(*, null, *) = IllegalArgumentException - * distance.compare(*, *, -1) = IllegalArgumentException - * distance.compare("","", 0) = 0 - * distance.compare("aaapppp", "", 8) = 7 - * distance.compare("aaapppp", "", 7) = 7 - * distance.compare("aaapppp", "", 6)) = -1 - * distance.compare("elephant", "hippo", 7) = 7 - * distance.compare("elephant", "hippo", 6) = -1 - * distance.compare("hippo", "elephant", 7) = 7 - * distance.compare("hippo", "elephant", 6) = -1 + * limitedCompare(null, *, *) = IllegalArgumentException + * limitedCompare(*, null, *) = IllegalArgumentException + * limitedCompare(*, *, -1) = IllegalArgumentException + * limitedCompare("","", 0) = 0 + * limitedCompare("aaapppp", "", 8) = 7 + * limitedCompare("aaapppp", "", 7) = 7 + * limitedCompare("aaapppp", "", 6)) = -1 + * limitedCompare("elephant", "hippo", 7) = 7 + * limitedCompare("elephant", "hippo", 6) = -1 + * limitedCompare("hippo", "elephant", 7) = 7 + * limitedCompare("hippo", "elephant", 6) = -1 * </pre> * * @param left the first string, must not be null * @param right the second string, must not be null * @param threshold the target threshold, must not be negative - * @return result distance - * @throws IllegalArgumentException if either String input {@code null} or - * negative threshold - */ - public Integer compare(CharSequence left, CharSequence right, int threshold) { - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); + * @return result distance, or -1 + */ + private static int limitedCompare(CharSequence left, CharSequence right, int threshold) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); } - if (threshold < 0) { - throw new IllegalArgumentException("Threshold must not be negative"); + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); } /* @@ -143,8 +195,16 @@ import java.util.Arrays; * and our threshold is 1. In this case we're going to walk a stripe of * length 3. The matrix would look like so: * - * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | - * | | |#|#| 6 | | | | |#| 7 | | | | | | + * <pre> + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + * </pre> * * Note how the stripe leads off the table as there is no possible way * to turn a string of length 5 into one of length 7 in edit distance of @@ -161,86 +221,195 @@ import java.util.Arrays; * some discussion. */ - int n = left.length(); // length of s - int m = right.length(); // length of t + int n = left.length(); // length of left + int m = right.length(); // length of right // if one string is empty, the edit distance is necessarily the length // of the other - if (n == 0) { - return m <= threshold ? m : -1; - } else if (m == 0) { - return n <= threshold ? n : -1; + if (n == 0) { + return m <= threshold ? m : -1; + } else if (m == 0) { + return n <= threshold ? n : -1; } - if (n > m) { + if (n > m) { // swap the two strings to consume less memory - final CharSequence tmp = left; - left = right; - right = tmp; - n = m; - m = right.length(); + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); } - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] _d; // placeholder to assist in swapping p and d + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; // placeholder to assist in swapping p and d // fill in starting table values - final int boundary = Math.min(n, threshold) + 1; - for (int i = 0; i < boundary; i++) { - p[i] = i; + final int boundary = Math.min(n, threshold) + 1; + for (int i = 0; i < boundary; i++) { + p[i] = i; } // these fills ensure that the value above the rightmost entry of our // stripe will be ignored in following loop iterations - Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); - Arrays.fill(d, Integer.MAX_VALUE); + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); // iterates through t - for (int j = 1; j <= m; j++) { - final char t_j = right.charAt(j - 1); // jth character of t - d[0] = j; + for (int j = 1; j <= m; j++) { + final char rightJ = right.charAt(j - 1); // jth character of right + d[0] = j; // compute stripe indices, constrain to array size - final int min = Math.max(1, j - threshold); - final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( + final int min = Math.max(1, j - threshold); + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( n, j + threshold); // the stripe may lead off of the table if s and t are of different // sizes - if (min > max) { - return -1; + if (min > max) { + return -1; } // ignore entry left of leftmost - if (min > 1) { - d[min - 1] = Integer.MAX_VALUE; + if (min > 1) { + d[min - 1] = Integer.MAX_VALUE; } // iterates through [min, max] in s - for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == t_j) { + for (int i = min; i <= max; i++) { + if (left.charAt(i - 1) == rightJ) { // diagonally left and up - d[i] = p[i - 1]; + d[i] = p[i - 1]; } else { // 1 + minimum of cell to the left, to the top, diagonally // left and up - d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); } } // copy current distance counts to 'previous row' distance counts - _d = p; - p = d; - d = _d; + tempD = p; + p = d; + d = tempD; } // if p[n] is greater than the threshold, there's no guarantee on it // being the correct // distance - if (p[n] <= threshold) { - return p[n]; + if (p[n] <= threshold) { + return p[n]; + } + return -1; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * unlimitedCompare(null, *) = IllegalArgumentException + * unlimitedCompare(*, null) = IllegalArgumentException + * unlimitedCompare("","") = 0 + * unlimitedCompare("","a") = 1 + * unlimitedCompare("aaapppp", "") = 7 + * unlimitedCompare("frog", "fog") = 1 + * unlimitedCompare("fly", "ant") = 3 + * unlimitedCompare("elephant", "hippo") = 7 + * unlimitedCompare("hippo", "elephant") = 7 + * unlimitedCompare("hippo", "zzzzzzzz") = 8 + * unlimitedCompare("hello", "hallo") = 1 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static int unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, + we maintain two single-dimensional arrays of length s.length() + 1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } + + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); } - return -1; + + int[] p = new int[n + 1]; //'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; //placeholder to assist in swapping p and d + + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right + + char rightJ; // jth character of right + + int cost; // cost + + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + rightJ = right.charAt(j - 1); + d[0] = j; + + for (i = 1; i <= n; i++) { + cost = left.charAt(i - 1) == rightJ ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return p[n]; } } Modified: websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.html (original) +++ websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.html Fri Apr 17 06:46:28 2015 @@ -1 +1 @@ -org.apache.commons.text.similarity

org.apache.commons.text.similarity

82
ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethodsMissedClasses
Total39 of 71795%18 of 12285%17771215911604
JaroWrinklerDistance2734993%164875%1440100801
FuzzyScore67893%16100%1111231301
HammingDistance53888%1990%171100201
LevenshteinDistance213100%13197%1190440301
\ No newline at end of file +org.apache.commons.text.similarity

org.apache.commons.text.similarity

ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethodsMissedClasses
Total65 of 1,08794%25 of 16285%281122024953107
JaroWrinklerDistance2735393%164875%144010870801
LevenshteinDistance1635796%45293%6364832801
CosineSimilarity812594%41071%4112250401
EditDistanceFrom62379%2100%25292401
HammingDistance53888%1990%171100201
FuzzyScore8397%16100%1111261301
CosineDistance43100%n/a02090201
\ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.source.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.source.html (original) +++ websites/production/commons/content/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.source.html Fri Apr 17 06:46:28 2015 @@ -1 +1 @@ -org.apache.commons.text.similarity

org.apache.commons.text.similarity

ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethodsMissedClasses
Total39 of 71795%18 of 12285%17771215911604
JaroWrinklerDistance.java2734993%164875%144010820801
FuzzyScore.java67893%16100%1111231301
HammingDistance.java53888%1990%171100201
LevenshteinDistance.java213100%13197%1190440301
\ No newline at end of file +org.apache.commons.text.similarity

org.apache.commons.text.similarity

< td class="ctr2" id="i0">87
ElementMissed InstructionsCov.Missed BranchesCov.MissedCxtyMissedLinesMissedMethodsMissedClasses
Total65 of 1,08794%25 of 16285%281122024953107
JaroWrinklerDistance.java2735393%164875%1440100801
LevenshteinDistance.java1635796%45293%6364832801
CosineSimilarity.java812594%41071%4112250401
EditDistanceFrom.java62379%2100%25292401
HammingDistance.java53888%19 90%171100201
FuzzyScore.java8397%16100%1111261301
CosineDi stance.java43100%n/a02090201
\ No newline at end of file