Mailing-List: contact commits-help@commons.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@commons.apache.org
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: psteitz@apache.org
To: commits@commons.apache.org
Message-Id: <37d34fdc741e42dcb2036d33b5b54d93@git.apache.org>
Subject: [math] Modified exactP to correctly handle ties.  JIRA: MATH-1246.
Date: Wed,  9 Sep 2015 01:02:06 +0000 (UTC)

Repository: commons-math
Updated Branches:
  refs/heads/master 2091cfbab -> ce98d0085


Modified exactP to correctly handle ties.  JIRA: MATH-1246.


Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/ce98d008
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/ce98d008
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/ce98d008

Branch: refs/heads/master
Commit: ce98d00852e21ce34d8d247db7f6be138967b559
Parents: 2091cfb
Author: Phil Steitz <phil.steitz@gmail.com>
Authored: Tue Sep 8 18:00:58 2015 -0700
Committer: Phil Steitz <phil.steitz@gmail.com>
Committed: Tue Sep 8 18:00:58 2015 -0700

----------------------------------------------------------------------
 .../stat/inference/KolmogorovSmirnovTest.java   | 93 +++++++++++++++++++-
 .../inference/KolmogorovSmirnovTestTest.java    | 78 +++++++++++++---
 2 files changed, 159 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-math/blob/ce98d008/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index d47e508..7137bfe 100644
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -19,6 +19,7 @@ package org.apache.commons.math4.stat.inference;
 
 import java.math.BigDecimal;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Iterator;
 
 import org.apache.commons.math4.util.Precision;
@@ -220,7 +221,10 @@ public class KolmogorovSmirnovTest {
      * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
      * is accomplished by enumerating all partitions of the combined sample into two subsamples of
      * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
-     * proportion of partitions that give \(D\) values exceeding the observed value.</li>
+     * proportion of partitions that give \(D\) values exceeding the observed value. In the very
+     * small sample case, if there are ties in the data, the actual sample values (including ties)
+     * are used in generating the partitions (which are basically multi-set partitions in this
+     * case).</li>
      * <li>For mid-size samples (product of sample sizes greater than or equal to
      * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
      * simulation is used to compute the p-value. The simulation randomly generates partitions and
@@ -243,6 +247,9 @@ public class KolmogorovSmirnovTest {
     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
         final long lengthProduct = (long) x.length * y.length;
         if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
+            if (hasTies(x, y)) {
+                return exactP(x, y, strict);
+            }
             return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
         }
         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
@@ -909,6 +916,67 @@ public class KolmogorovSmirnovTest {
     }
 
     /**
+     * Computes the exact p value for a two-sample Kolmogorov-Smirnov test with
+     * {@code x} and {@code y} as samples, possibly containing ties. This method
+     * uses the same implementation as {@link #exactP(double, int, int, boolean)}
+     * with the exception that it examines partitions of the combined sample,
+     * preserving ties in the data.  What is returned is the exact probability
+     * that a random partition of the combined dataset into a subset of size
+     * {@code x.length} and another of size {@code y.length} yields a \(D\)
+     * value greater than (resp greater than or equal to) \(D(x,y)\).
+     * <p>
+     * This method should not be used on large samples (a good rule of thumb is
+     * to keep the product of the sample sizes less than
+     * {@link #SMALL_SAMPLE_PRODUCT} when using this method).  If the data do
+     * not contain ties, {@link #exactP(double[], double[], boolean)} should be
+     * used instead of this method.</p>
+     *
+     * @param x first sample
+     * @param y second sample
+     * @param strict whether or not the inequality in the null hypothesis is strict
+     * @return p-value
+     */
+    public double exactP(double[] x, double[] y, boolean strict) {
+        final double d = kolmogorovSmirnovStatistic(x, y);
+        final int n = x.length;
+        final int m = y.length;
+
+        // Concatenate x and y into universe, preserving ties in the data
+        final double[] universe = new double[n + m];
+        System.arraycopy(x, 0, universe, 0, n);
+        System.arraycopy(y, 0, universe, n, m);
+
+        // Iterate over all n, m partitions of the n + m elements in the universe,
+        // Computing D for each one
+        Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
+        long tail = 0;
+        final double[] nSet = new double[n];
+        final double[] mSet = new double[m];
+        final double tol = 1e-12;  // d-values within tol of one another are considered equal
+        while (combinationsIterator.hasNext()) {
+            // Generate an n-set
+            final int[] nSetI = combinationsIterator.next();
+            // Copy the elements of the universe in the n-set to nSet
+            // and the others to mSet
+            int j = 0;
+            int k = 0;
+            for (int i = 0; i < n + m; i++) {
+                if (j < n && nSetI[j] == i) {
+                    nSet[j++] = universe[i];
+                } else {
+                    mSet[k++] = universe[i];
+                }
+            }
+            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
+            final int order = Precision.compareTo(curD, d, tol);
+            if (order > 0 || (order == 0 && !strict)) {
+                tail++;
+            }
+        }
+        return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
+    }
+
+    /**
      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
      * is the 2-sample Kolmogorov-Smirnov statistic. See
      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
@@ -1009,4 +1077,27 @@ public class KolmogorovSmirnovTest {
         }
         return (double) tail / iterations;
     }
+
+    /**
+     * Returns true iff there are ties in the combined sample
+     * formed from x and y.
+     *
+     * @param x first sample
+     * @param y second sample
+     * @return true if x and y together contain ties
+     */
+    private boolean hasTies(double[] x, double[] y) {
+        HashSet<Double> values = new HashSet<Double>();
+            for (int i = 0; i < x.length; i++) {
+                if (!values.add(x[i])) {
+                    return true;
+                }
+            }
+            for (int i = 0; i < y.length; i++) {
+                if (!values.add(y[i])) {
+                    return true;
+                }
+            }
+        return false;
+    }
 }

http://git-wip-us.apache.org/repos/asf/commons-math/blob/ce98d008/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index e5c03fe..2cc1b1d 100644
--- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -185,6 +185,62 @@ public class KolmogorovSmirnovTestTest {
         Assert.assertEquals(0.5, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE);
     }
 
+    /** Small sample no ties, exactP methods should agree */
+    @Test
+    public void testExactPConsistency() {
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+        final double[] x = {
+            1, 7, 9, 13, 19, 21, 22, 23, 24
+        };
+        final double[] y = {
+            3, 4, 12, 16, 20, 27, 28, 32, 44, 54
+        };
+        Assert.assertEquals(test.exactP(x, y, true),
+                            test.exactP(test.kolmogorovSmirnovStatistic(x, y),
+                                        x.length, y.length, true), Double.MIN_VALUE);
+        Assert.assertEquals(test.exactP(x, y, false),
+                            test.exactP(test.kolmogorovSmirnovStatistic(x, y),
+                                        x.length, y.length, false), Double.MIN_VALUE);
+    }
+
+    /**
+     * Extreme case for ties - all values the same.  Strict p-value should be 0,
+     * non-strict should be 1
+     */
+    @Test
+    public void testExactPNoVariance() {
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+        final double[] x = {
+            1, 1, 1, 1, 1, 1
+        };
+        final double[] y = {
+            1, 1, 1, 1
+        };
+        Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE);
+        Assert.assertEquals(1, test.exactP(x, y, false), Double.MIN_VALUE);
+        Assert.assertEquals(0, test.kolmogorovSmirnovTest(x, y, true), Double.MIN_VALUE);
+        Assert.assertEquals(1, test.kolmogorovSmirnovTest(x, y, false), Double.MIN_VALUE);
+    }
+
+    /**
+     * Split {0, 0, 0, 1, 1, 1} into 3-sets.  Most extreme is 0's vs 1's.  Non-strict
+     * p-value for this split should be 2 / (6 choose 3); strict should be 0.
+     */
+    @Test
+    public void testExactPSimpleSplit() {
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+        final double[] x = {
+            0, 0, 0
+        };
+        final double[] y = {
+            1, 1, 1
+        };
+        // Above is one way to do this - other way is s/x/y - so 2 in strict test below
+        Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE);
+        Assert.assertEquals(2 / (double) CombinatoricsUtils.binomialCoefficient(6, 3),
+                            test.exactP(x, y, false), Double.MIN_VALUE);
+    }
+
     /**
      * Checks exact p-value computations using critical values from Table 9 in V.K Rohatgi, An
      * Introduction to Probability and Mathematical Statistics, Wiley, 1976, ISBN 0-471-73135-8.
@@ -430,10 +486,10 @@ public class KolmogorovSmirnovTestTest {
             Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.);
         }
     }
-    
+
     /**
      * JIRA: MATH-1245
-     * 
+     *
      * Verify that D-values are not viewed as distinct when they are mathematically equal
      * when computing p-statistics for small sample tests. Reference values are from R 3.2.0.
      */
@@ -444,19 +500,19 @@ public class KolmogorovSmirnovTestTest {
         final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18};
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         Assert.assertEquals(0.0027495724090154106, test.kolmogorovSmirnovTest(x, y,false), tol);
-        
+
         final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13};
         final double[] y1 = {0, 1, 3, 5, 7};
         Assert.assertEquals(0.085914085914085896, test.kolmogorovSmirnovTest(x1, y1, false), tol);
-        
+
         final double[] x2 = {4, 6, 7, 8, 9, 10, 11};
         final double[] y2 = {0, 1, 2, 3, 5};
-        Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol); 
+        Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol);
     }
-    
+
     /**
      * JIRA: MATH-1245
-     * 
+     *
      * Verify that D-values are not viewed as distinct when they are mathematically equal
      * when computing p-statistics for small sample tests. Reference values are from R 3.2.0.
      */
@@ -465,17 +521,17 @@ public class KolmogorovSmirnovTestTest {
         final double tol = 1e-2;
         final int iterations = 1000000;
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
-        
+
         final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12};
         final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18};
         double d = test.kolmogorovSmirnovStatistic(x, y);
         Assert.assertEquals(0.0027495724090154106, test.monteCarloP(d, x.length, y.length, false, iterations), tol);
-        
+
         final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13};
         final double[] y1 = {0, 1, 3, 5, 7};
         d = test.kolmogorovSmirnovStatistic(x1, y1);
         Assert.assertEquals(0.085914085914085896, test.monteCarloP(d, x1.length, y1.length, false, iterations), tol);
-        
+
         final double[] x2 = {4, 6, 7, 8, 9, 10, 11};
         final double[] y2 = {0, 1, 2, 3, 5};
         d = test.kolmogorovSmirnovStatistic(x2, y2);
@@ -566,4 +622,4 @@ public class KolmogorovSmirnovTestTest {
         Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon);
     }
 
-}
\ No newline at end of file
+}