Return-Path: X-Original-To: apmail-commons-commits-archive@minotaur.apache.org Delivered-To: apmail-commons-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id DE660D51F for ; Tue, 11 Dec 2012 05:36:36 +0000 (UTC) Received: (qmail 49083 invoked by uid 500); 11 Dec 2012 05:36:36 -0000 Delivered-To: apmail-commons-commits-archive@commons.apache.org Received: (qmail 48696 invoked by uid 500); 11 Dec 2012 05:36:32 -0000 Mailing-List: contact commits-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list commits@commons.apache.org Received: (qmail 48658 invoked by uid 99); 11 Dec 2012 05:36:30 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 11 Dec 2012 05:36:30 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 11 Dec 2012 05:36:28 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id A753223889E0; Tue, 11 Dec 2012 05:36:08 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1420000 - in /commons/proper/math/trunk/src: changes/changes.xml main/java/org/apache/commons/math3/random/EmpiricalDistribution.java test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java Date: Tue, 11 Dec 2012 05:36:07 -0000 To: commits@commons.apache.org From: psteitz@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20121211053608.A753223889E0@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: psteitz Date: Tue Dec 11 05:36:06 2012 New Revision: 1420000 URL: http://svn.apache.org/viewvc?rev=1420000&view=rev Log: Added RealDistribution methods to EmpiricalDistribution. JIRA: MATH-672. Modified: commons/proper/math/trunk/src/changes/changes.xml commons/proper/math/trunk/src/main/java/org/apache/commons/math3/random/EmpiricalDistribution.java commons/proper/math/trunk/src/test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java Modified: commons/proper/math/trunk/src/changes/changes.xml URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/changes/changes.xml?rev=1420000&r1=1419999&r2=1420000&view=diff ============================================================================== --- commons/proper/math/trunk/src/changes/changes.xml (original) +++ commons/proper/math/trunk/src/changes/changes.xml Tue Dec 11 05:36:06 2012 @@ -77,6 +77,10 @@ This is a minor release: It combines bug 2. A few methods in the FastMath class are in fact slower that their counterpart in either Math or StrictMath (cf. MATH-740 and MATH-901). "> + + Added methods to EmpiricalDistribution to implement the RealDistribution + interface. + DBSCAN clustering algorithm (in package "o.a.c.m.stat.clustering"). Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/random/EmpiricalDistribution.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/random/EmpiricalDistribution.java?rev=1420000&r1=1419999&r2=1420000&view=diff ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/random/EmpiricalDistribution.java (original) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/random/EmpiricalDistribution.java Tue Dec 11 05:36:06 2012 @@ -23,14 +23,18 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Serializable; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; +import org.apache.commons.math3.distribution.AbstractRealDistribution; +import org.apache.commons.math3.distribution.NormalDistribution; +import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.exception.MathIllegalStateException; +import org.apache.commons.math3.exception.MathInternalError; import org.apache.commons.math3.exception.NullArgumentException; +import org.apache.commons.math3.exception.OutOfRangeException; import org.apache.commons.math3.exception.ZeroException; import org.apache.commons.math3.exception.util.LocalizedFormats; import org.apache.commons.math3.stat.descriptive.StatisticalSummary; @@ -39,11 +43,12 @@ import org.apache.commons.math3.util.Fas import org.apache.commons.math3.util.MathUtils; /** - * Represents an + *

Represents an * empirical probability distribution -- a probability distribution derived * from observed data without making any assumptions about the functional form - * of the population distribution that the data come from.

- * An EmpiricalDistribution maintains data structures, called + * of the population distribution that the data come from.

+ * + *

An EmpiricalDistribution maintains data structures, called * distribution digests, that describe empirical distributions and * support the following operations:

    *
  • loading the distribution from a file of observed data values
  • @@ -57,6 +62,7 @@ import org.apache.commons.math3.util.Mat * frequency histograms representing the input data or to generate random values * "like" those in the input file -- i.e., the values generated will follow the * distribution of the values in the file.

    + * *

    The implementation uses what amounts to the * * Variable Kernel Method with Gaussian smoothing:

    @@ -71,7 +77,18 @@ import org.apache.commons.math3.util.Mat *

  • Generate a uniformly distributed value in (0,1)
  • *
  • Select the subinterval to which the value belongs. *
  • Generate a random Gaussian value with mean = mean of the associated - * bin and std dev = std dev of associated bin.
  • + * bin and std dev = std dev of associated bin.

    + * + *

    EmpiricalDistribution implements the {@link RealDistribution} interface + * as follows. Given x within the range of values in the dataset, let B + * be the bin containing x and let K be the within-bin kernel for B. Let P(B-) + * be the sum of the probabilities of the bins below B and let K(B) be the + * mass of B under K (i.e., the integral of the kernel density over B). Then + * set P(X < x) = P(B-) + K(x) / K(B) where K(x) is the kernel distribution + * evaluated at x. This results in a cdf that matches the grouped frequency + * distribution at the bin endpoints and interpolates within bins using + * within-bin kernels.

    + * *USAGE NOTES:
      *
    • The binCount is set by default to 1000. A good rule of thumb * is to set the bin count to approximately the length of the input file divided @@ -82,7 +99,7 @@ import org.apache.commons.math3.util.Mat * * @version $Id$ */ -public class EmpiricalDistribution implements Serializable { +public class EmpiricalDistribution extends AbstractRealDistribution { /** Default bin count */ public static final int DEFAULT_BIN_COUNT = 1000; @@ -192,16 +209,16 @@ public class EmpiricalDistribution imple * * @param in the input data array * @exception NullArgumentException if in is null - * @throws MathIllegalStateException if an IOException occurs */ - public void load(double[] in) throws NullArgumentException, MathIllegalStateException { + public void load(double[] in) throws NullArgumentException { DataAdapter da = new ArrayDataAdapter(in); try { da.computeStats(); // new adapter for the second pass fillBinStats(new ArrayDataAdapter(in)); - } catch (IOException e) { - throw new MathIllegalStateException(e, LocalizedFormats.SIMPLE_MESSAGE, e.getLocalizedMessage()); + } catch (IOException ex) { + // Can't happen + throw new MathInternalError(); } loaded = true; @@ -213,7 +230,7 @@ public class EmpiricalDistribution imple *

      The input file must be an ASCII text file containing one * valid numeric entry per line.

      * - * @param url url of the input file + * @param url url of the input file * * @throws IOException if an IO error occurs * @throws NullArgumentException if url is null @@ -429,9 +446,9 @@ public class EmpiricalDistribution imple */ private int findBin(double value) { return FastMath.min( - FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0), + FastMath.max((int) FastMath.ceil((value - min) / delta) - 1, 0), binCount - 1); - } + } /** * Generates a random value from this distribution. @@ -513,9 +530,8 @@ public class EmpiricalDistribution imple */ public double[] getUpperBounds() { double[] binUpperBounds = new double[binCount]; - binUpperBounds[0] = min + delta; - for (int i = 1; i < binCount - 1; i++) { - binUpperBounds[i] = binUpperBounds[i-1] + delta; + for (int i = 0; i < binCount - 1; i++) { + binUpperBounds[i] = min + delta * (i + 1); } binUpperBounds[binCount - 1] = max; return binUpperBounds; @@ -557,4 +573,263 @@ public class EmpiricalDistribution imple public void reSeed(long seed) { randomData.reSeed(seed); } + + // Distribution methods --------------------------- + + /** + * {@inheritDoc} + * @since 3.1 + */ + public double probability(double x) { + return 0; + } + + /** + * {@inheritDoc} + * + *

      Returns the kernel density normalized so that its integral over each bin + * equals the bin mass.

      + * + *

      Algorithm description:

        + *
      1. Find the bin B that x belongs to.
      2. + *
      3. Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the + * integral of the kernel density over B).
      4. + *
      5. Return k(x) * P(B) / K(B), where k is the within-bin kernel density + * and P(B) is the mass of B.

      + * @since 3.1 + */ + public double density(double x) { + if (x < min || x > max) { + return 0d; + } + final int binIndex = findBin(x); + final RealDistribution kernel = getKernel(binStats.get(binIndex)); + return kernel.density(x) * pB(binIndex) / kB(binIndex); + } + + /** + * {@inheritDoc} + * + *

      Algorithm description:

        + *
      1. Find the bin B that x belongs to.
      2. + *
      3. Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.
      4. + *
      5. Compute K(B) = the probability mass of B with respect to the within-bin kernel + * and K(B-) = the kernel distribution evaluated at the lower endpoint of B
      6. + *
      7. Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where + * K(x) is the within-bin kernel distribution function evaluated at x.

      + * + * @since 3.1 + */ + public double cumulativeProbability(double x) { + if (x < min) { + return 0d; + } else if (x >= max) { + return 1d; + } + final int binIndex = findBin(x); + final double pBminus = pBminus(binIndex); + final double pB = pB(binIndex); + final double[] binBounds = getUpperBounds(); + final double kB = kB(binIndex); + final double lower = binIndex == 0 ? min : binBounds[binIndex - 1]; + final RealDistribution kernel = k(x); + final double withinBinCum = + (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB; + return pBminus + pB * withinBinCum; + } + + /** + * {@inheritDoc} + * + *

      Algorithm description:

        + *
      1. Find the smallest i such that the sum of the masses of the bins + * through i is at least p.
      2. + *
      3. + * Let K be the within-bin kernel distribution for bin i.
        + * Let K(B) be the mass of B under K.
        + * Let K(B-) be K evaluated at the lower endpoint of B (the combined + * mass of the bins below B under K).
        + * Let P(B) be the probability of bin i.
        + * Let P(B-) be the sum of the bin masses below bin i.
        + * Let pCrit = p - P(B-)
        + *
      4. Return the inverse of K evaluated at
        + * K(B-) + pCrit * K(B) / P(B)
      5. + *

      + * + * @since 3.1 + */ + public double inverseCumulativeProbability(final double p) throws OutOfRangeException { + if (p < 0.0 || p > 1.0) { + throw new OutOfRangeException(p, 0, 1); + } + + if (p == 0.0) { + return getSupportLowerBound(); + } + + if (p == 1.0) { + return getSupportUpperBound(); + } + + int i = 0; + while (cumBinP(i) < p) { + i++; + } + + final RealDistribution kernel = getKernel(binStats.get(i)); + final double kB = kB(i); + final double[] binBounds = getUpperBounds(); + final double lower = i == 0 ? min : binBounds[i - 1]; + final double kBminus = kernel.cumulativeProbability(lower); + final double pB = pB(i); + final double pBminus = pBminus(i); + final double pCrit = p - pBminus; + if (pCrit <= 0) { + return lower; + } + return kernel.inverseCumulativeProbability(kBminus + pCrit * kB / pB); + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public double getNumericalMean() { + return sampleStats.getMean(); + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public double getNumericalVariance() { + return sampleStats.getVariance(); + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public double getSupportLowerBound() { + return min; + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public double getSupportUpperBound() { + return max; + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public boolean isSupportLowerBoundInclusive() { + return true; + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public boolean isSupportUpperBoundInclusive() { + return true; + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + public boolean isSupportConnected() { + return true; + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + @Override + public double sample() { + return getNextValue(); + } + + /** + * {@inheritDoc} + * @since 3.1 + */ + @Override + public void reseedRandomGenerator(long seed) { + randomData.reSeed(seed); + } + + /** + * The probability of bin i. + * + * @param i the index of the bin + * @return the probability that selection begins in bin i + */ + private double pB(int i) { + return i == 0 ? upperBounds[0] : + upperBounds[i] - upperBounds[i - 1]; + } + + /** + * The combined probability of the bins up to but not including bin i. + * + * @param i the index of the bin + * @return the probability that selection begins in a bin below bin i. + */ + private double pBminus(int i) { + return i == 0 ? 0 : upperBounds[i - 1]; + } + + /** + * Mass of bin i under the within-bin kernel of the bin. + * + * @param i index of the bin + * @return the difference in the within-bin kernel cdf between the + * upper and lower endpoints of bin i + */ + @SuppressWarnings("deprecation") + private double kB(int i) { + final double[] binBounds = getUpperBounds(); + final RealDistribution kernel = getKernel(binStats.get(i)); + return i == 0 ? kernel.cumulativeProbability(min, binBounds[0]) : + kernel.cumulativeProbability(binBounds[i - 1], binBounds[i]); + } + + /** + * The within-bin kernel of the bin that x belongs to. + * + * @param x the value to locate within a bin + * @return the within-bin kernel of the bin containing x + */ + private RealDistribution k(double x) { + final int binIndex = findBin(x); + return getKernel(binStats.get(binIndex)); + } + + /** + * The combined probability of the bins up to and including binIndex. + * + * @param binIndex maximum bin index + * @return sum of the probabilities of bins through binIndex + */ + private double cumBinP(int binIndex) { + return upperBounds[binIndex]; + } + + /** + * The within-bin smoothing kernel. + * + * @param bStats summary statistics for the bin + * @return within-bin kernel parameterized by bStats + */ + private RealDistribution getKernel(SummaryStatistics bStats) { + // For now, hard-code Gaussian (only kernel supported) + return new NormalDistribution( + bStats.getMean(), bStats.getStandardDeviation()); + } } Modified: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java?rev=1420000&r1=1419999&r2=1420000&view=diff ============================================================================== --- commons/proper/math/trunk/src/test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java (original) +++ commons/proper/math/trunk/src/test/java/org/apache/commons/math3/random/EmpiricalDistributionTest.java Tue Dec 11 05:36:06 2012 @@ -24,6 +24,12 @@ import java.net.URL; import java.util.ArrayList; import org.apache.commons.math3.TestUtils; +import org.apache.commons.math3.analysis.UnivariateFunction; +import org.apache.commons.math3.analysis.integration.BaseAbstractUnivariateIntegrator; +import org.apache.commons.math3.analysis.integration.IterativeLegendreGaussIntegrator; +import org.apache.commons.math3.distribution.NormalDistribution; +import org.apache.commons.math3.distribution.RealDistribution; +import org.apache.commons.math3.distribution.RealDistributionAbstractTest; import org.apache.commons.math3.exception.NullArgumentException; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.junit.Assert; @@ -36,30 +42,35 @@ import org.junit.Test; * @version $Id$ */ -public final class EmpiricalDistributionTest { +public final class EmpiricalDistributionTest extends RealDistributionAbstractTest { protected EmpiricalDistribution empiricalDistribution = null; protected EmpiricalDistribution empiricalDistribution2 = null; protected File file = null; protected URL url = null; protected double[] dataArray = null; + protected final int n = 10000; @Before - public void setUp() throws IOException { + public void setUp() { + super.setUp(); empiricalDistribution = new EmpiricalDistribution(100); url = getClass().getResource("testData.txt"); - - empiricalDistribution2 = new EmpiricalDistribution(100); - BufferedReader in = + final ArrayList list = new ArrayList(); + try { + empiricalDistribution2 = new EmpiricalDistribution(100); + BufferedReader in = new BufferedReader(new InputStreamReader( url.openStream())); - String str = null; - ArrayList list = new ArrayList(); - while ((str = in.readLine()) != null) { - list.add(Double.valueOf(str)); + String str = null; + while ((str = in.readLine()) != null) { + list.add(Double.valueOf(str)); + } + in.close(); + in = null; + } catch (IOException ex) { + Assert.fail("IOException " + ex); } - in.close(); - in = null; dataArray = new double[list.size()]; int i = 0; @@ -286,4 +297,133 @@ public final class EmpiricalDistribution Assert.assertEquals("mean", 5.069831575018909, stats.getMean(), tolerance); Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(), tolerance); } + + // Setup for distribution tests + + @Override + public RealDistribution makeDistribution() { + // Create a uniform distribution on [0, 10,000] + final double[] sourceData = new double[n + 1]; + for (int i = 0; i < n + 1; i++) { + sourceData[i] = i; + } + EmpiricalDistribution dist = new EmpiricalDistribution(); + dist.load(sourceData); + return dist; + } + + /** Uniform bin mass = 10/10001 == mass of all but the first bin */ + private final double binMass = 10d / (double) (n + 1); + + /** Mass of first bin = 11/10001 */ + private final double firstBinMass = 11d / (double) (n + 1); + + @Override + public double[] makeCumulativeTestPoints() { + final double[] testPoints = new double[] {9, 10, 15, 1000, 5004, 9999}; + return testPoints; + } + + + @Override + public double[] makeCumulativeTestValues() { + /* + * Bins should be [0, 10], (10, 20], ..., (9990, 10000] + * Kernels should be N(4.5, 3.02765), N(14.5, 3.02765)... + * Each bin should have mass 10/10000 = .001 + */ + final double[] testPoints = getCumulativeTestPoints(); + final double[] cumValues = new double[testPoints.length]; + final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution(); + final double[] binBounds = empiricalDistribution.getUpperBounds(); + for (int i = 0; i < testPoints.length; i++) { + final int bin = findBin(testPoints[i]); + final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() : + binBounds[bin - 1]; + final double upper = binBounds[bin]; + // Compute bMinus = sum or mass of bins below the bin containing the point + // First bin has mass 11 / 10000, the rest have mass 10 / 10000. + final double bMinus = bin == 0 ? 0 : (bin - 1) * binMass + firstBinMass; + final RealDistribution kernel = findKernel(lower, upper); + final double withinBinKernelMass = kernel.cumulativeProbability(lower, upper); + final double kernelCum = kernel.cumulativeProbability(lower, testPoints[i]); + cumValues[i] = bMinus + (bin == 0 ? firstBinMass : binMass) * kernelCum/withinBinKernelMass; + } + return cumValues; + } + + @Override + public double[] makeDensityTestValues() { + final double[] testPoints = getCumulativeTestPoints(); + final double[] densityValues = new double[testPoints.length]; + final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution(); + final double[] binBounds = empiricalDistribution.getUpperBounds(); + for (int i = 0; i < testPoints.length; i++) { + final int bin = findBin(testPoints[i]); + final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() : + binBounds[bin - 1]; + final double upper = binBounds[bin]; + final RealDistribution kernel = findKernel(lower, upper); + final double withinBinKernelMass = kernel.cumulativeProbability(lower, upper); + final double density = kernel.density(testPoints[i]); + densityValues[i] = density * (bin == 0 ? firstBinMass : binMass) / withinBinKernelMass; + } + return densityValues; + } + + /** + * Modify test integration bounds from the default. Because the distribution + * has discontinuities at bin boundaries, integrals spanning multiple bins + * will face convergence problems. Only test within-bin integrals and spans + * across no more than 3 bin boundaries. + */ + @Override + @Test + public void testDensityIntegrals() { + final RealDistribution distribution = makeDistribution(); + final double tol = 1.0e-9; + final BaseAbstractUnivariateIntegrator integrator = + new IterativeLegendreGaussIntegrator(5, 1.0e-12, 1.0e-10); + final UnivariateFunction d = new UnivariateFunction() { + public double value(double x) { + return distribution.density(x); + } + }; + final double[] lower = {0, 5, 1000, 5001, 9995}; + final double[] upper = {5, 12, 1030, 5010, 10000}; + for (int i = 1; i < 5; i++) { + Assert.assertEquals( + distribution.cumulativeProbability( + lower[i], upper[i]), + integrator.integrate( + 1000000, // Triangle integrals are very slow to converge + d, lower[i], upper[i]), tol); + } + } + + /** + * Find the bin that x belongs (relative to {@link #makeDistribution()}). + */ + private int findBin(double x) { + // Number of bins below x should be trunc(x/10) + final double nMinus = Math.floor(x / 10); + final int bin = (int) Math.round(nMinus); + // If x falls on a bin boundary, it is in the lower bin + return Math.floor(x / 10) == x / 10 ? bin - 1 : bin; + } + + /** + * Find the within-bin kernel for the bin with lower bound lower + * and upper bound upper. All bins other than the first contain 10 points + * exclusive of the lower bound and are centered at (lower + upper + 1) / 2. + * The first bin includes its lower bound, 0, so has different mean and + * standard deviation. + */ + private RealDistribution findKernel(double lower, double upper) { + if (lower < 1) { + return new NormalDistribution(5d, 3.3166247903554); + } else { + return new NormalDistribution((upper + lower + 1) / 2d, 3.0276503540974917); + } + } }