Return-Path: X-Original-To: apmail-commons-commits-archive@minotaur.apache.org Delivered-To: apmail-commons-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id B56BC1727D for ; Wed, 9 Sep 2015 01:02:06 +0000 (UTC) Received: (qmail 46124 invoked by uid 500); 9 Sep 2015 01:02:06 -0000 Delivered-To: apmail-commons-commits-archive@commons.apache.org Received: (qmail 46034 invoked by uid 500); 9 Sep 2015 01:02:06 -0000 Mailing-List: contact commits-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list commits@commons.apache.org Received: (qmail 46024 invoked by uid 99); 9 Sep 2015 01:02:06 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 09 Sep 2015 01:02:06 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 5A787E0252; Wed, 9 Sep 2015 01:02:06 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: psteitz@apache.org To: commits@commons.apache.org Message-Id: <37d34fdc741e42dcb2036d33b5b54d93@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [math] Modified exactP to correctly handle ties. JIRA: MATH-1246. Date: Wed, 9 Sep 2015 01:02:06 +0000 (UTC) Repository: commons-math Updated Branches: refs/heads/master 2091cfbab -> ce98d0085 Modified exactP to correctly handle ties. JIRA: MATH-1246. Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/ce98d008 Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/ce98d008 Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/ce98d008 Branch: refs/heads/master Commit: ce98d00852e21ce34d8d247db7f6be138967b559 Parents: 2091cfb Author: Phil Steitz Authored: Tue Sep 8 18:00:58 2015 -0700 Committer: Phil Steitz Committed: Tue Sep 8 18:00:58 2015 -0700 ---------------------------------------------------------------------- .../stat/inference/KolmogorovSmirnovTest.java | 93 +++++++++++++++++++- .../inference/KolmogorovSmirnovTestTest.java | 78 +++++++++++++--- 2 files changed, 159 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-math/blob/ce98d008/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java index d47e508..7137bfe 100644 --- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java @@ -19,6 +19,7 @@ package org.apache.commons.math4.stat.inference; import java.math.BigDecimal; import java.util.Arrays; +import java.util.HashSet; import java.util.Iterator; import org.apache.commons.math4.util.Precision; @@ -220,7 +221,10 @@ public class KolmogorovSmirnovTest { * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This * is accomplished by enumerating all partitions of the combined sample into two subsamples of * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the - * proportion of partitions that give \(D\) values exceeding the observed value. + * proportion of partitions that give \(D\) values exceeding the observed value. In the very + * small sample case, if there are ties in the data, the actual sample values (including ties) + * are used in generating the partitions (which are basically multi-set partitions in this + * case). *
  • For mid-size samples (product of sample sizes greater than or equal to * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo * simulation is used to compute the p-value. The simulation randomly generates partitions and @@ -243,6 +247,9 @@ public class KolmogorovSmirnovTest { public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { final long lengthProduct = (long) x.length * y.length; if (lengthProduct < SMALL_SAMPLE_PRODUCT) { + if (hasTies(x, y)) { + return exactP(x, y, strict); + } return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict); } if (lengthProduct < LARGE_SAMPLE_PRODUCT) { @@ -909,6 +916,67 @@ public class KolmogorovSmirnovTest { } /** + * Computes the exact p value for a two-sample Kolmogorov-Smirnov test with + * {@code x} and {@code y} as samples, possibly containing ties. This method + * uses the same implementation as {@link #exactP(double, int, int, boolean)} + * with the exception that it examines partitions of the combined sample, + * preserving ties in the data. What is returned is the exact probability + * that a random partition of the combined dataset into a subset of size + * {@code x.length} and another of size {@code y.length} yields a \(D\) + * value greater than (resp greater than or equal to) \(D(x,y)\). + *

    + * This method should not be used on large samples (a good rule of thumb is + * to keep the product of the sample sizes less than + * {@link #SMALL_SAMPLE_PRODUCT} when using this method). If the data do + * not contain ties, {@link #exactP(double[], double[], boolean)} should be + * used instead of this method.

    + * + * @param x first sample + * @param y second sample + * @param strict whether or not the inequality in the null hypothesis is strict + * @return p-value + */ + public double exactP(double[] x, double[] y, boolean strict) { + final double d = kolmogorovSmirnovStatistic(x, y); + final int n = x.length; + final int m = y.length; + + // Concatenate x and y into universe, preserving ties in the data + final double[] universe = new double[n + m]; + System.arraycopy(x, 0, universe, 0, n); + System.arraycopy(y, 0, universe, n, m); + + // Iterate over all n, m partitions of the n + m elements in the universe, + // Computing D for each one + Iterator combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); + long tail = 0; + final double[] nSet = new double[n]; + final double[] mSet = new double[m]; + final double tol = 1e-12; // d-values within tol of one another are considered equal + while (combinationsIterator.hasNext()) { + // Generate an n-set + final int[] nSetI = combinationsIterator.next(); + // Copy the elements of the universe in the n-set to nSet + // and the others to mSet + int j = 0; + int k = 0; + for (int i = 0; i < n + m; i++) { + if (j < n && nSetI[j] == i) { + nSet[j++] = universe[i]; + } else { + mSet[k++] = universe[i]; + } + } + final double curD = kolmogorovSmirnovStatistic(nSet, mSet); + final int order = Precision.compareTo(curD, d, tol); + if (order > 0 || (order == 0 && !strict)) { + tail++; + } + } + return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n); + } + + /** * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) * is the 2-sample Kolmogorov-Smirnov statistic. See * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). @@ -1009,4 +1077,27 @@ public class KolmogorovSmirnovTest { } return (double) tail / iterations; } + + /** + * Returns true iff there are ties in the combined sample + * formed from x and y. + * + * @param x first sample + * @param y second sample + * @return true if x and y together contain ties + */ + private boolean hasTies(double[] x, double[] y) { + HashSet values = new HashSet(); + for (int i = 0; i < x.length; i++) { + if (!values.add(x[i])) { + return true; + } + } + for (int i = 0; i < y.length; i++) { + if (!values.add(y[i])) { + return true; + } + } + return false; + } } http://git-wip-us.apache.org/repos/asf/commons-math/blob/ce98d008/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java index e5c03fe..2cc1b1d 100644 --- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java @@ -185,6 +185,62 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(0.5, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE); } + /** Small sample no ties, exactP methods should agree */ + @Test + public void testExactPConsistency() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + final double[] x = { + 1, 7, 9, 13, 19, 21, 22, 23, 24 + }; + final double[] y = { + 3, 4, 12, 16, 20, 27, 28, 32, 44, 54 + }; + Assert.assertEquals(test.exactP(x, y, true), + test.exactP(test.kolmogorovSmirnovStatistic(x, y), + x.length, y.length, true), Double.MIN_VALUE); + Assert.assertEquals(test.exactP(x, y, false), + test.exactP(test.kolmogorovSmirnovStatistic(x, y), + x.length, y.length, false), Double.MIN_VALUE); + } + + /** + * Extreme case for ties - all values the same. Strict p-value should be 0, + * non-strict should be 1 + */ + @Test + public void testExactPNoVariance() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + final double[] x = { + 1, 1, 1, 1, 1, 1 + }; + final double[] y = { + 1, 1, 1, 1 + }; + Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE); + Assert.assertEquals(1, test.exactP(x, y, false), Double.MIN_VALUE); + Assert.assertEquals(0, test.kolmogorovSmirnovTest(x, y, true), Double.MIN_VALUE); + Assert.assertEquals(1, test.kolmogorovSmirnovTest(x, y, false), Double.MIN_VALUE); + } + + /** + * Split {0, 0, 0, 1, 1, 1} into 3-sets. Most extreme is 0's vs 1's. Non-strict + * p-value for this split should be 2 / (6 choose 3); strict should be 0. + */ + @Test + public void testExactPSimpleSplit() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + final double[] x = { + 0, 0, 0 + }; + final double[] y = { + 1, 1, 1 + }; + // Above is one way to do this - other way is s/x/y - so 2 in strict test below + Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE); + Assert.assertEquals(2 / (double) CombinatoricsUtils.binomialCoefficient(6, 3), + test.exactP(x, y, false), Double.MIN_VALUE); + } + /** * Checks exact p-value computations using critical values from Table 9 in V.K Rohatgi, An * Introduction to Probability and Mathematical Statistics, Wiley, 1976, ISBN 0-471-73135-8. @@ -430,10 +486,10 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.); } } - + /** * JIRA: MATH-1245 - * + * * Verify that D-values are not viewed as distinct when they are mathematically equal * when computing p-statistics for small sample tests. Reference values are from R 3.2.0. */ @@ -444,19 +500,19 @@ public class KolmogorovSmirnovTestTest { final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18}; final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); Assert.assertEquals(0.0027495724090154106, test.kolmogorovSmirnovTest(x, y,false), tol); - + final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13}; final double[] y1 = {0, 1, 3, 5, 7}; Assert.assertEquals(0.085914085914085896, test.kolmogorovSmirnovTest(x1, y1, false), tol); - + final double[] x2 = {4, 6, 7, 8, 9, 10, 11}; final double[] y2 = {0, 1, 2, 3, 5}; - Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol); + Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol); } - + /** * JIRA: MATH-1245 - * + * * Verify that D-values are not viewed as distinct when they are mathematically equal * when computing p-statistics for small sample tests. Reference values are from R 3.2.0. */ @@ -465,17 +521,17 @@ public class KolmogorovSmirnovTestTest { final double tol = 1e-2; final int iterations = 1000000; final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); - + final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12}; final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18}; double d = test.kolmogorovSmirnovStatistic(x, y); Assert.assertEquals(0.0027495724090154106, test.monteCarloP(d, x.length, y.length, false, iterations), tol); - + final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13}; final double[] y1 = {0, 1, 3, 5, 7}; d = test.kolmogorovSmirnovStatistic(x1, y1); Assert.assertEquals(0.085914085914085896, test.monteCarloP(d, x1.length, y1.length, false, iterations), tol); - + final double[] x2 = {4, 6, 7, 8, 9, 10, 11}; final double[] y2 = {0, 1, 2, 3, 5}; d = test.kolmogorovSmirnovStatistic(x2, y2); @@ -566,4 +622,4 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon); } -} \ No newline at end of file +}