spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sryza <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-8598] [MLlib] Implementation of 1-sampl...
Date Wed, 08 Jul 2015 21:27:11 GMT
Github user sryza commented on a diff in the pull request:

    https://github.com/apache/spark/pull/6994#discussion_r34200807
  
    --- Diff: mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala ---
    @@ -153,4 +157,101 @@ class HypothesisTestSuite extends SparkFunSuite with MLlibTestSparkContext {
           Statistics.chiSqTest(sc.parallelize(continuousFeature, 2))
         }
       }
    +
    +  test("1 sample Kolmogorov-Smirnov test") {
    +    // Create theoretical distributions
    +    val stdNormalDist = new NormalDistribution(0, 1)
    +    val expDist = new ExponentialDistribution(0.6)
    +    val unifDist = new UniformRealDistribution()
    +
    +    // set seeds
    +    val seed = 10L
    +    stdNormalDist.reseedRandomGenerator(seed)
    +    expDist.reseedRandomGenerator(seed)
    +    unifDist.reseedRandomGenerator(seed)
    +
    +    // Sample data from the distributions and parallelize it
    +    val n = 100000
    +    val sampledNorm = sc.parallelize(stdNormalDist.sample(n), 10)
    +    val sampledExp = sc.parallelize(expDist.sample(n), 10)
    +    val sampledUnif = sc.parallelize(unifDist.sample(n), 10)
    +
    +    // Use a apache math commons local KS test to verify calculations
    +    val ksTest = new KolmogorovSmirnovTest()
    +    val pThreshold = 0.05
    +
    +    // Comparing a standard normal sample to a standard normal distribution
    +    val result1 = Statistics.ksTest(sampledNorm, "stdnorm")
    +    val referenceStat1 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, sampledNorm.collect())
    +    val referencePVal1 = 1 - ksTest.cdf(referenceStat1, n)
    +    // Verify vs apache math commons ks test
    +    assert(result1.statistic === referenceStat1)
    +    assert(result1.pValue === referencePVal1)
    +    // Cannot reject null hypothesis
    +    assert(result1.pValue > pThreshold)
    +
    +    // Comparing an exponential sample to a standard normal distribution
    +    val result2 = Statistics.ksTest(sampledExp, "stdnorm")
    +    val referenceStat2 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, sampledExp.collect())
    +    val referencePVal2 = 1 - ksTest.cdf(referenceStat2, n)
    +    // verify vs apache math commons ks test
    +    assert(result2.statistic === referenceStat2)
    +    assert(result2.pValue === referencePVal2)
    +    // reject null hypothesis
    +    assert(result2.pValue < pThreshold)
    +
    +    // Testing the use of a user provided CDF function
    +    // Distribution is not serializable, so will have to create in the lambda
    +    val expCDF = (x: Double) => new ExponentialDistribution(0.2).cumulativeProbability(x)
    +
    +    // Comparing an exponential sample with mean X to an exponential distribution with mean Y
    +    // Where X != Y
    +    val result3 = Statistics.ksTest(sampledExp, expCDF)
    +    val referenceStat3 = ksTest.kolmogorovSmirnovStatistic(new ExponentialDistribution(0.2),
    +      sampledExp.collect())
    +    val referencePVal3 = 1 - ksTest.cdf(referenceStat3, sampledNorm.count().toInt)
    +    // verify vs apache math commons ks test
    +    assert(result3.statistic === referenceStat3)
    +    assert(result3.pValue === referencePVal3)
    +    // reject null hypothesis
    +    assert(result3.pValue < pThreshold)
    +
    +    /*
    +     Comparing results with R's implementation of Kolmogorov-Smirnov for 1 sample
    --- End diff --
    
    Indent this block an extra space


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message