Return-Path: X-Original-To: apmail-commons-notifications-archive@minotaur.apache.org Delivered-To: apmail-commons-notifications-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 0DEB617C3C for ; Sun, 1 Mar 2015 12:14:42 +0000 (UTC) Received: (qmail 86010 invoked by uid 500); 1 Mar 2015 12:14:35 -0000 Delivered-To: apmail-commons-notifications-archive@commons.apache.org Received: (qmail 85972 invoked by uid 500); 1 Mar 2015 12:14:35 -0000 Mailing-List: contact notifications-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list notifications@commons.apache.org Received: (qmail 85667 invoked by uid 99); 1 Mar 2015 12:14:35 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 01 Mar 2015 12:14:35 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id 53D6CAC07B8 for ; Sun, 1 Mar 2015 12:14:35 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r941823 [21/21] - in /websites/production/commons/content/sandbox/commons-text: ./ apidocs/ apidocs/org/apache/commons/text/diff/ apidocs/org/apache/commons/text/diff/class-use/ apidocs/org/apache/commons/text/similarity/ apidocs/org/apache... Date: Sun, 01 Mar 2015 12:14:31 -0000 To: notifications@commons.apache.org From: britter@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150301121435.53D6CAC07B8@hades.apache.org> Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/LevenshteinDistance.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/LevenshteinDistance.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/LevenshteinDistance.html Sun Mar 1 12:14:29 2015 @@ -27,245 +27,233 @@ 19 import java.util.Arrays; 20 21 /** -22 * <p> -23 * A string metric for measuring the difference between two sequences. -24 * </p> -25 * -26 * <p> -27 * This code has been adapted from Apache Commons Lang 3.3. +22 * An algorithm for measuring the difference between two character sequences. +23 * +24 * <p> +25 * This is the number of changes needed to change one sequence into another, +26 * where each change is a single character modification (deletion, insertion +27 * or substitution). 28 * </p> 29 * -30 * @since 1.0 -31 */ -32 public class LevenshteinDistance implements StringMetric<Integer> { -33 -34 /** -35 * <p> -36 * Find the Levenshtein distance between two Strings. -37 * </p> +30 * <p> +31 * This code has been adapted from Apache Commons Lang 3.3. +32 * </p> +33 */ +34 public class LevenshteinDistance implements StringMetric<Integer> { +35 +36 /** +37 * Find the Levenshtein distance between two Strings. 38 * -39 * <p> -40 * This is the number of changes needed to change one String into another, -41 * where each change is a single character modification (deletion, insertion -42 * or substitution). -43 * </p> -44 * -45 * <p> -46 * The previous implementation of the Levenshtein distance algorithm was -47 * from <a -48 * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com -49 * /ld.htm</a> -50 * </p> -51 * -52 * <p> -53 * Chas Emerick has written an implementation in Java, which avoids an -54 * OutOfMemoryError which can occur when my Java implementation is used with -55 * very large strings.<br> -56 * This implementation of the Levenshtein distance algorithm is from <a -57 * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ -58 * ldjava.htm</a> -59 * </p> -60 * -61 * <pre> -62 * StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException -63 * StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException -64 * StringUtils.getLevenshteinDistance("","") = 0 -65 * StringUtils.getLevenshteinDistance("","a") = 1 -66 * StringUtils.getLevenshteinDistance("aaapppp", "") = 7 -67 * StringUtils.getLevenshteinDistance("frog", "fog") = 1 -68 * StringUtils.getLevenshteinDistance("fly", "ant") = 3 -69 * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 -70 * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 -71 * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 -72 * StringUtils.getLevenshteinDistance("hello", "hallo") = 1 -73 * </pre> -74 * -75 * @param left the first string, must not be null -76 * @param right the second string, must not be null -77 * @return result distance -78 * @throws IllegalArgumentException if either String input {@code null} -79 */ -80 @Override -81 public Integer compare(CharSequence left, CharSequence right) { -82 return compare(left, right, Integer.MAX_VALUE); -83 } -84 -85 /** -86 * <p> -87 * Find the Levenshtein distance between two Strings if it's less than or -88 * equal to a given threshold. -89 * </p> -90 * -91 * <p> -92 * This is the number of changes needed to change one String into another, -93 * where each change is a single character modification (deletion, insertion -94 * or substitution). -95 * </p> -96 * -97 * <p> -98 * This implementation follows from Algorithms on Strings, Trees and -99 * Sequences by Dan Gusfield and Chas Emerick's implementation of the -100 * Levenshtein distance algorithm from <a -101 * href="http://www.merriampark.com/ld.htm" -102 * >http://www.merriampark.com/ld.htm</a>; -103 * </p> -104 * -105 * <pre> -106 * StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException -107 * StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException -108 * StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException -109 * StringUtils.getLevenshteinDistance("","", 0) = 0 -110 * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7 -111 * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7 -112 * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1 -113 * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7 -114 * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1 -115 * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7 -116 * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1 -117 * </pre> -118 * -119 * @param left the first string, must not be null -120 * @param right the second string, must not be null -121 * @param threshold the target threshold, must not be negative -122 * @return result distance -123 * @throws IllegalArgumentException if either String input {@code null} or -124 * negative threshold -125 */ -126 public Integer compare(CharSequence left, CharSequence right, int threshold) { -127 if (left == null || right == null) { -128 throw new IllegalArgumentException("Strings must not be null"); -129 } -130 if (threshold < 0) { -131 throw new IllegalArgumentException("Threshold must not be negative"); -132 } -133 -134 /* -135 * This implementation only computes the distance if it's less than or -136 * equal to the threshold value, returning -1 if it's greater. The -137 * advantage is performance: unbounded distance is O(nm), but a bound of -138 * k allows us to reduce it to O(km) time by only computing a diagonal -139 * stripe of width 2k + 1 of the cost table. It is also possible to use -140 * this to compute the unbounded Levenshtein distance by starting the -141 * threshold at 1 and doubling each time until the distance is found; -142 * this is O(dm), where d is the distance. -143 * -144 * One subtlety comes from needing to ignore entries on the border of -145 * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry -146 * to the left of the leftmost member We must ignore the entry above the -147 * rightmost member +39 * <p>A higher score indicates a greater distance.</p> +40 * +41 * <p> +42 * The previous implementation of the Levenshtein distance algorithm was +43 * from <a +44 * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com +45 * /ld.htm</a> +46 * </p> +47 * +48 * <p> +49 * Chas Emerick has written an implementation in Java, which avoids an +50 * OutOfMemoryError which can occur when my Java implementation is used with +51 * very large strings.<br> +52 * This implementation of the Levenshtein distance algorithm is from <a +53 * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ +54 * ldjava.htm</a> +55 * </p> +56 * +57 * <pre> +58 * distance.compare(null, *) = IllegalArgumentException +59 * distance.compare(*, null) = IllegalArgumentException +60 * distance.compare("","") = 0 +61 * distance.compare("","a") = 1 +62 * distance.compare("aaapppp", "") = 7 +63 * distance.compare("frog", "fog") = 1 +64 * distance.compare("fly", "ant") = 3 +65 * distance.compare("elephant", "hippo") = 7 +66 * distance.compare("hippo", "elephant") = 7 +67 * distance.compare("hippo", "zzzzzzzz") = 8 +68 * distance.compare("hello", "hallo") = 1 +69 * </pre> +70 * +71 * @param left the first string, must not be null +72 * @param right the second string, must not be null +73 * @return result distance +74 * @throws IllegalArgumentException if either String input {@code null} +75 */ +76 @Override +77 public Integer compare(CharSequence left, CharSequence right) { +78 return compare(left, right, Integer.MAX_VALUE); +79 } +80 +81 /** +82 * Find the Levenshtein distance between two CharSequences if it's less than or +83 * equal to a given threshold. +84 * +85 * <p> +86 * This implementation follows from Algorithms on Strings, Trees and +87 * Sequences by Dan Gusfield and Chas Emerick's implementation of the +88 * Levenshtein distance algorithm from <a +89 * href="http://www.merriampark.com/ld.htm" +90 * >http://www.merriampark.com/ld.htm</a>; +91 * </p> +92 * +93 * <pre> +94 * distance.compare(null, *, *) = IllegalArgumentException +95 * distance.compare(*, null, *) = IllegalArgumentException +96 * distance.compare(*, *, -1) = IllegalArgumentException +97 * distance.compare("","", 0) = 0 +98 * distance.compare("aaapppp", "", 8) = 7 +99 * distance.compare("aaapppp", "", 7) = 7 +100 * distance.compare("aaapppp", "", 6)) = -1 +101 * distance.compare("elephant", "hippo", 7) = 7 +102 * distance.compare("elephant", "hippo", 6) = -1 +103 * distance.compare("hippo", "elephant", 7) = 7 +104 * distance.compare("hippo", "elephant", 6) = -1 +105 * </pre> +106 * +107 * @param left the first string, must not be null +108 * @param right the second string, must not be null +109 * @param threshold the target threshold, must not be negative +110 * @return result distance +111 * @throws IllegalArgumentException if either String input {@code null} or +112 * negative threshold +113 */ +114 public Integer compare(CharSequence left, CharSequence right, int threshold) { +115 if (left == null || right == null) { +116 throw new IllegalArgumentException("Strings must not be null"); +117 } +118 if (threshold < 0) { +119 throw new IllegalArgumentException("Threshold must not be negative"); +120 } +121 +122 /* +123 * This implementation only computes the distance if it's less than or +124 * equal to the threshold value, returning -1 if it's greater. The +125 * advantage is performance: unbounded distance is O(nm), but a bound of +126 * k allows us to reduce it to O(km) time by only computing a diagonal +127 * stripe of width 2k + 1 of the cost table. It is also possible to use +128 * this to compute the unbounded Levenshtein distance by starting the +129 * threshold at 1 and doubling each time until the distance is found; +130 * this is O(dm), where d is the distance. +131 * +132 * One subtlety comes from needing to ignore entries on the border of +133 * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry +134 * to the left of the leftmost member We must ignore the entry above the +135 * rightmost member +136 * +137 * Another subtlety comes from our stripe running off the matrix if the +138 * strings aren't of the same size. Since string s is always swapped to +139 * be the shorter of the two, the stripe will always run off to the +140 * upper right instead of the lower left of the matrix. +141 * +142 * As a concrete example, suppose s is of length 5, t is of length 7, +143 * and our threshold is 1. In this case we're going to walk a stripe of +144 * length 3. The matrix would look like so: +145 * +146 * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | +147 * | | |#|#| 6 | | | | |#| 7 | | | | | | 148 * -149 * Another subtlety comes from our stripe running off the matrix if the -150 * strings aren't of the same size. Since string s is always swapped to -151 * be the shorter of the two, the stripe will always run off to the -152 * upper right instead of the lower left of the matrix. -153 * -154 * As a concrete example, suppose s is of length 5, t is of length 7, -155 * and our threshold is 1. In this case we're going to walk a stripe of -156 * length 3. The matrix would look like so: -157 * -158 * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | -159 * | | |#|#| 6 | | | | |#| 7 | | | | | | -160 * -161 * Note how the stripe leads off the table as there is no possible way -162 * to turn a string of length 5 into one of length 7 in edit distance of -163 * 1. -164 * -165 * Additionally, this implementation decreases memory usage by using two -166 * single-dimensional arrays and swapping them back and forth instead of -167 * allocating an entire n by m matrix. This requires a few minor -168 * changes, such as immediately returning when it's detected that the -169 * stripe has run off the matrix and initially filling the arrays with -170 * large values so that entries we don't compute are ignored. -171 * -172 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for -173 * some discussion. -174 */ -175 -176 int n = left.length(); // length of s -177 int m = right.length(); // length of t -178 -179 // if one string is empty, the edit distance is necessarily the length -180 // of the other -181 if (n == 0) { -182 return m <= threshold ? m : -1; -183 } else if (m == 0) { -184 return n <= threshold ? n : -1; -185 } -186 -187 if (n > m) { -188 // swap the two strings to consume less memory -189 final CharSequence tmp = left; -190 left = right; -191 right = tmp; -192 n = m; -193 m = right.length(); -194 } -195 -196 int[] p = new int[n + 1]; // 'previous' cost array, horizontally -197 int[] d = new int[n + 1]; // cost array, horizontally -198 int[] _d; // placeholder to assist in swapping p and d -199 -200 // fill in starting table values -201 final int boundary = Math.min(n, threshold) + 1; -202 for (int i = 0; i < boundary; i++) { -203 p[i] = i; -204 } -205 // these fills ensure that the value above the rightmost entry of our -206 // stripe will be ignored in following loop iterations -207 Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); -208 Arrays.fill(d, Integer.MAX_VALUE); -209 -210 // iterates through t -211 for (int j = 1; j <= m; j++) { -212 final char t_j = right.charAt(j - 1); // jth character of t -213 d[0] = j; -214 -215 // compute stripe indices, constrain to array size -216 final int min = Math.max(1, j - threshold); -217 final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( -218 n, j + threshold); -219 -220 // the stripe may lead off of the table if s and t are of different -221 // sizes -222 if (min > max) { -223 return -1; -224 } -225 -226 // ignore entry left of leftmost -227 if (min > 1) { -228 d[min - 1] = Integer.MAX_VALUE; +149 * Note how the stripe leads off the table as there is no possible way +150 * to turn a string of length 5 into one of length 7 in edit distance of +151 * 1. +152 * +153 * Additionally, this implementation decreases memory usage by using two +154 * single-dimensional arrays and swapping them back and forth instead of +155 * allocating an entire n by m matrix. This requires a few minor +156 * changes, such as immediately returning when it's detected that the +157 * stripe has run off the matrix and initially filling the arrays with +158 * large values so that entries we don't compute are ignored. +159 * +160 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for +161 * some discussion. +162 */ +163 +164 int n = left.length(); // length of s +165 int m = right.length(); // length of t +166 +167 // if one string is empty, the edit distance is necessarily the length +168 // of the other +169 if (n == 0) { +170 return m <= threshold ? m : -1; +171 } else if (m == 0) { +172 return n <= threshold ? n : -1; +173 } +174 +175 if (n > m) { +176 // swap the two strings to consume less memory +177 final CharSequence tmp = left; +178 left = right; +179 right = tmp; +180 n = m; +181 m = right.length(); +182 } +183 +184 int[] p = new int[n + 1]; // 'previous' cost array, horizontally +185 int[] d = new int[n + 1]; // cost array, horizontally +186 int[] _d; // placeholder to assist in swapping p and d +187 +188 // fill in starting table values +189 final int boundary = Math.min(n, threshold) + 1; +190 for (int i = 0; i < boundary; i++) { +191 p[i] = i; +192 } +193 // these fills ensure that the value above the rightmost entry of our +194 // stripe will be ignored in following loop iterations +195 Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); +196 Arrays.fill(d, Integer.MAX_VALUE); +197 +198 // iterates through t +199 for (int j = 1; j <= m; j++) { +200 final char t_j = right.charAt(j - 1); // jth character of t +201 d[0] = j; +202 +203 // compute stripe indices, constrain to array size +204 final int min = Math.max(1, j - threshold); +205 final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( +206 n, j + threshold); +207 +208 // the stripe may lead off of the table if s and t are of different +209 // sizes +210 if (min > max) { +211 return -1; +212 } +213 +214 // ignore entry left of leftmost +215 if (min > 1) { +216 d[min - 1] = Integer.MAX_VALUE; +217 } +218 +219 // iterates through [min, max] in s +220 for (int i = min; i <= max; i++) { +221 if (left.charAt(i - 1) == t_j) { +222 // diagonally left and up +223 d[i] = p[i - 1]; +224 } else { +225 // 1 + minimum of cell to the left, to the top, diagonally +226 // left and up +227 d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); +228 } 229 } 230 -231 // iterates through [min, max] in s -232 for (int i = min; i <= max; i++) { -233 if (left.charAt(i - 1) == t_j) { -234 // diagonally left and up -235 d[i] = p[i - 1]; -236 } else { -237 // 1 + minimum of cell to the left, to the top, diagonally -238 // left and up -239 d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); -240 } -241 } -242 -243 // copy current distance counts to 'previous row' distance counts -244 _d = p; -245 p = d; -246 d = _d; -247 } -248 -249 // if p[n] is greater than the threshold, there's no guarantee on it -250 // being the correct -251 // distance -252 if (p[n] <= threshold) { -253 return p[n]; -254 } -255 return -1; -256 } -257 -258 } +231 // copy current distance counts to 'previous row' distance counts +232 _d = p; +233 p = d; +234 d = _d; +235 } +236 +237 // if p[n] is greater than the threshold, there's no guarantee on it +238 // being the correct +239 // distance +240 if (p[n] <= threshold) { +241 return p[n]; +242 } +243 return -1; +244 } +245 +246 }
- + - + \ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/StringMetric.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/StringMetric.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/StringMetric.html Sun Mar 1 12:14:29 2015 @@ -25,34 +25,29 @@ 17 package org.apache.commons.text.similarity; 18 19 /** -20 * <p> -21 * Marker interface for <a -22 * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a> -23 * </p> -24 * -25 * <p> -26 * A string metric measures the distance between two text strings. -27 * </p> -28 * -29 * @param <R> return type of the edit distance -30 * @since 1.0 -31 */ -32 public interface StringMetric<R> { -33 -34 /** -35 * <p> -36 * Compares two strings. -37 * </p> -38 * -39 * @param left the first string -40 * @param right the second string -41 * @return result distance -42 */ -43 R compare(CharSequence left, CharSequence right); -44 -45 } +20 * Interface for <a href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a>. +21 * +22 * <p> +23 * A string metric measures the similarity between two character sequences. Depending on +24 * the algorithm, higher values can mean closer strings, or more distant strings. +25 * </p> +26 * +27 * @param <R> The type of similarity score unit used by this StringMetric. +28 */ +29 public interface StringMetric<R> { +30 +31 /** +32 * Compares two CharSequences. +33 * +34 * @param left the first CharSequence +35 * @param right the second CharSequence +36 * @return the similarity score between two CharSequences +37 */ +38 R compare(CharSequence left, CharSequence right); +39 +40 }
- + - + \ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html Sun Mar 1 12:14:29 2015 @@ -16,7 +16,10 @@
  • - FuzzyDistance + FuzzyScore +
  • +
  • + HammingDistance
  • JaroWrinklerDistance Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html Sun Mar 1 12:14:29 2015 @@ -37,7 +37,12 @@ - FuzzyDistance + FuzzyScore + + + + + HammingDistance @@ -78,7 +83,7 @@
    \ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html Sun Mar 1 12:14:29 2015 @@ -16,10 +16,12 @@ - Modified: websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html Sun Mar 1 12:14:29 2015 @@ -35,6 +35,11 @@ + org.apache.commons.text.diff + + + + org.apache.commons.text.similarity @@ -60,7 +65,7 @@
    \ No newline at end of file Modified: websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css (original) +++ websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css Sun Mar 1 12:14:29 2015 @@ -111,4 +111,4 @@ hr { .jxr_keyword { color: #000; -} +} \ No newline at end of file