getBinStats() {
+ return binStats;
+ }
+
+ /**
+ * Returns a fresh copy of the array of upper bounds for the bins.
+ * Bins are:
+ * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
+ * (upperBounds[binCount-2], upperBounds[binCount-1] = max].
+ *
+ * Note: In versions 1.0-2.0 of commons-math, this method
+ * incorrectly returned the array of probability generator upper
+ * bounds now returned by {@link #getGeneratorUpperBounds()}.
+ *
+ * @return array of bin upper bounds
+ * @since 2.1
+ */
+ public double[] getUpperBounds() {
+ double[] binUpperBounds = new double[binCount];
+ for (int i = 0; i < binCount - 1; i++) {
+ binUpperBounds[i] = min + delta * (i + 1);
+ }
+ binUpperBounds[binCount - 1] = max;
+ return binUpperBounds;
+ }
+
+ /**
+ * Returns a fresh copy of the array of upper bounds of the subintervals
+ * of [0,1] used in generating data from the empirical distribution.
+ * Subintervals correspond to bins with lengths proportional to bin counts.
+ *
+ * Preconditions:
+ * - the distribution must be loaded before invoking this method
+ *
+ * In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
+ * by {@link #getUpperBounds()}.
+ *
+ * @since 2.1
+ * @return array of upper bounds of subintervals used in data generation
+ * @throws NullPointerException unless a {@code load} method has been
+ * called beforehand.
+ */
+ public double[] getGeneratorUpperBounds() {
+ int len = upperBounds.length;
+ double[] out = new double[len];
+ System.arraycopy(upperBounds, 0, out, 0, len);
+ return out;
+ }
+
+ /**
+ * Property indicating whether or not the distribution has been loaded.
+ *
+ * @return true if the distribution has been loaded
+ */
+ public boolean isLoaded() {
+ return loaded;
+ }
+
+ // Distribution methods ---------------------------
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public double probability(double x) {
+ return 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Returns the kernel density normalized so that its integral over each bin
+ * equals the bin mass.
+ *
+ * Algorithm description:
+ * - Find the bin B that x belongs to.
+ * - Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
+ * integral of the kernel density over B).
+ * - Return k(x) * P(B) / K(B), where k is the within-bin kernel density
+ * and P(B) is the mass of B.
+ * @since 3.1
+ */
+ @Override
+ public double density(double x) {
+ if (x < min || x > max) {
+ return 0d;
+ }
+ final int binIndex = findBin(x);
+ final RealDistribution kernel = getKernel(binStats.get(binIndex));
+ return kernel.density(x) * pB(binIndex) / kB(binIndex);
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Algorithm description:
+ * - Find the bin B that x belongs to.
+ * - Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.
+ * - Compute K(B) = the probability mass of B with respect to the within-bin kernel
+ * and K(B-) = the kernel distribution evaluated at the lower endpoint of B
+ * - Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
+ * K(x) is the within-bin kernel distribution function evaluated at x.
+ * If K is a constant distribution, we return P(B-) + P(B) (counting the full
+ * mass of B).
+ *
+ * @since 3.1
+ */
+ @Override
+ public double cumulativeProbability(double x) {
+ if (x < min) {
+ return 0d;
+ } else if (x >= max) {
+ return 1d;
+ }
+ final int binIndex = findBin(x);
+ final double pBminus = pBminus(binIndex);
+ final double pB = pB(binIndex);
+ final RealDistribution kernel = k(x);
+ if (kernel instanceof ConstantRealDistribution) {
+ if (x < kernel.getNumericalMean()) {
+ return pBminus;
+ } else {
+ return pBminus + pB;
+ }
+ }
+ final double[] binBounds = getUpperBounds();
+ final double kB = kB(binIndex);
+ final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
+ final double withinBinCum =
+ (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB;
+ return pBminus + pB * withinBinCum;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Algorithm description:
+ * - Find the smallest i such that the sum of the masses of the bins
+ * through i is at least p.
+ * -
+ * Let K be the within-bin kernel distribution for bin i.
+ * Let K(B) be the mass of B under K.
+ * Let K(B-) be K evaluated at the lower endpoint of B (the combined
+ * mass of the bins below B under K).
+ * Let P(B) be the probability of bin i.
+ * Let P(B-) be the sum of the bin masses below bin i.
+ * Let pCrit = p - P(B-)
+ * - Return the inverse of K evaluated at
+ * K(B-) + pCrit * K(B) / P(B)
+ *
+ *
+ * @since 3.1
+ */
+ @Override
+ public double inverseCumulativeProbability(final double p) throws OutOfRangeException {
+ if (p < 0.0 || p > 1.0) {
+ throw new OutOfRangeException(p, 0, 1);
+ }
+
+ if (p == 0.0) {
+ return getSupportLowerBound();
+ }
+
+ if (p == 1.0) {
+ return getSupportUpperBound();
+ }
+
+ int i = 0;
+ while (cumBinP(i) < p) {
+ i++;
+ }
+
+ final RealDistribution kernel = getKernel(binStats.get(i));
+ final double kB = kB(i);
+ final double[] binBounds = getUpperBounds();
+ final double lower = i == 0 ? min : binBounds[i - 1];
+ final double kBminus = kernel.cumulativeProbability(lower);
+ final double pB = pB(i);
+ final double pBminus = pBminus(i);
+ final double pCrit = p - pBminus;
+ if (pCrit <= 0) {
+ return lower;
+ }
+ return kernel.inverseCumulativeProbability(kBminus + pCrit * kB / pB);
+ }
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public double getNumericalMean() {
+ return sampleStats.getMean();
+ }
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public double getNumericalVariance() {
+ return sampleStats.getVariance();
+ }
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public double getSupportLowerBound() {
+ return min;
+ }
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public double getSupportUpperBound() {
+ return max;
+ }
+
+ /**
+ * {@inheritDoc}
+ * @since 3.1
+ */
+ @Override
+ public boolean isSupportConnected() {
+ return true;
+ }
+
+ /**{@inheritDoc} */
+ @Override
+ public RealDistribution.Sampler createSampler(final UniformRandomProvider rng) {
+ if (!loaded) {
+ throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
+ }
+ return super.createSampler(rng);
+ }
+
+ /**
+ * The probability of bin i.
+ *
+ * @param i the index of the bin
+ * @return the probability that selection begins in bin i
+ */
+ private double pB(int i) {
+ return i == 0 ? upperBounds[0] :
+ upperBounds[i] - upperBounds[i - 1];
+ }
+
+ /**
+ * The combined probability of the bins up to but not including bin i.
+ *
+ * @param i the index of the bin
+ * @return the probability that selection begins in a bin below bin i.
+ */
+ private double pBminus(int i) {
+ return i == 0 ? 0 : upperBounds[i - 1];
+ }
+
+ /**
+ * Mass of bin i under the within-bin kernel of the bin.
+ *
+ * @param i index of the bin
+ * @return the difference in the within-bin kernel cdf between the
+ * upper and lower endpoints of bin i
+ */
+ private double kB(int i) {
+ final double[] binBounds = getUpperBounds();
+ final RealDistribution kernel = getKernel(binStats.get(i));
+ return i == 0 ? kernel.probability(min, binBounds[0]) :
+ kernel.probability(binBounds[i - 1], binBounds[i]);
+ }
+
+ /**
+ * The within-bin kernel of the bin that x belongs to.
+ *
+ * @param x the value to locate within a bin
+ * @return the within-bin kernel of the bin containing x
+ */
+ private RealDistribution k(double x) {
+ final int binIndex = findBin(x);
+ return getKernel(binStats.get(binIndex));
+ }
+
+ /**
+ * The combined probability of the bins up to and including binIndex.
+ *
+ * @param binIndex maximum bin index
+ * @return sum of the probabilities of bins through binIndex
+ */
+ private double cumBinP(int binIndex) {
+ return upperBounds[binIndex];
+ }
+
+ /**
+ * The within-bin smoothing kernel. Returns a Gaussian distribution
+ * parameterized by {@code bStats}, unless the bin contains only one
+ * observation, in which case a constant distribution is returned.
+ *
+ * @param bStats summary statistics for the bin
+ * @return within-bin kernel parameterized by bStats
+ */
+ protected RealDistribution getKernel(SummaryStatistics bStats) {
+ if (bStats.getN() == 1 || bStats.getVariance() == 0) {
+ return new ConstantRealDistribution(bStats.getMean());
+ } else {
+ return new NormalDistribution(bStats.getMean(), bStats.getStandardDeviation(),
+ NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-math/blob/ccba0cfc/src/main/java/org/apache/commons/math4/random/EmpiricalDistribution.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math4/random/EmpiricalDistribution.java b/src/main/java/org/apache/commons/math4/random/EmpiricalDistribution.java
deleted file mode 100644
index 5439cd2..0000000
--- a/src/main/java/org/apache/commons/math4/random/EmpiricalDistribution.java
+++ /dev/null
@@ -1,748 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.commons.math4.random;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.math4.distribution.AbstractRealDistribution;
-import org.apache.commons.math4.distribution.ConstantRealDistribution;
-import org.apache.commons.math4.distribution.NormalDistribution;
-import org.apache.commons.math4.distribution.RealDistribution;
-import org.apache.commons.math4.exception.MathIllegalStateException;
-import org.apache.commons.math4.exception.MathInternalError;
-import org.apache.commons.math4.exception.NullArgumentException;
-import org.apache.commons.math4.exception.OutOfRangeException;
-import org.apache.commons.math4.exception.ZeroException;
-import org.apache.commons.math4.exception.NotStrictlyPositiveException;
-import org.apache.commons.math4.exception.util.LocalizedFormats;
-import org.apache.commons.math4.stat.descriptive.StatisticalSummary;
-import org.apache.commons.math4.stat.descriptive.SummaryStatistics;
-import org.apache.commons.math4.rng.UniformRandomProvider;
-import org.apache.commons.math4.util.FastMath;
-import org.apache.commons.math4.util.MathUtils;
-
-/**
- * Represents an
- * empirical probability distribution -- a probability distribution derived
- * from observed data without making any assumptions about the functional form
- * of the population distribution that the data come from.
- *
- * An EmpiricalDistribution
maintains data structures, called
- * distribution digests, that describe empirical distributions and
- * support the following operations:
- * - loading the distribution from a file of observed data values
- * - dividing the input data into "bin ranges" and reporting bin frequency
- * counts (data for histogram)
- * - reporting univariate statistics describing the full set of data values
- * as well as the observations within each bin
- * - generating random values from the distribution
- *
- * Applications can use EmpiricalDistribution
to build grouped
- * frequency histograms representing the input data or to generate random values
- * "like" those in the input file -- i.e., the values generated will follow the
- * distribution of the values in the file.
- *
- * The implementation uses what amounts to the
- *
- * Variable Kernel Method with Gaussian smoothing:
- * Digesting the input file
- *
- Pass the file once to compute min and max.
- * - Divide the range from min-max into
binCount
"bins."
- * - Pass the data file again, computing bin counts and univariate
- * statistics (mean, std dev.) for each of the bins
- * - Divide the interval (0,1) into subintervals associated with the bins,
- * with the length of a bin's subinterval proportional to its count.
- * Generating random values from the distribution
- * - Generate a uniformly distributed value in (0,1)
- * - Select the subinterval to which the value belongs.
- *
- Generate a random Gaussian value with mean = mean of the associated
- * bin and std dev = std dev of associated bin.
- *
- * EmpiricalDistribution implements the {@link RealDistribution} interface
- * as follows. Given x within the range of values in the dataset, let B
- * be the bin containing x and let K be the within-bin kernel for B. Let P(B-)
- * be the sum of the probabilities of the bins below B and let K(B) be the
- * mass of B under K (i.e., the integral of the kernel density over B). Then
- * set P(X < x) = P(B-) + P(B) * K(x) / K(B) where K(x) is the kernel distribution
- * evaluated at x. This results in a cdf that matches the grouped frequency
- * distribution at the bin endpoints and interpolates within bins using
- * within-bin kernels.
- *
- *USAGE NOTES:
- *- The
binCount
is set by default to 1000. A good rule of thumb
- * is to set the bin count to approximately the length of the input file divided
- * by 10.
- *- The input file must be a plain text file containing one valid numeric
- * entry per line.
- *
- *
- */
-public class EmpiricalDistribution extends AbstractRealDistribution {
-
- /** Default bin count */
- public static final int DEFAULT_BIN_COUNT = 1000;
-
- /** Character set for file input */
- private static final String FILE_CHARSET = "US-ASCII";
-
- /** Serializable version identifier */
- private static final long serialVersionUID = 5729073523949762654L;
-
- /** List of SummaryStatistics objects characterizing the bins */
- private final List binStats;
-
- /** Sample statistics */
- private SummaryStatistics sampleStats = null;
-
- /** Max loaded value */
- private double max = Double.NEGATIVE_INFINITY;
-
- /** Min loaded value */
- private double min = Double.POSITIVE_INFINITY;
-
- /** Grid size */
- private double delta = 0d;
-
- /** number of bins */
- private final int binCount;
-
- /** is the distribution loaded? */
- private boolean loaded = false;
-
- /** upper bounds of subintervals in (0,1) "belonging" to the bins */
- private double[] upperBounds = null;
-
- /**
- * Creates a new EmpiricalDistribution with the default bin count.
- */
- public EmpiricalDistribution() {
- this(DEFAULT_BIN_COUNT);
- }
-
- /**
- * Creates a new EmpiricalDistribution with the specified bin count.
- *
- * @param binCount number of bins. Must be strictly positive.
- * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
- */
- public EmpiricalDistribution(int binCount) {
- if (binCount <= 0) {
- throw new NotStrictlyPositiveException(binCount);
- }
- this.binCount = binCount;
- binStats = new ArrayList();
- }
-
- /**
- * Computes the empirical distribution from the provided
- * array of numbers.
- *
- * @param in the input data array
- * @exception NullArgumentException if in is null
- */
- public void load(double[] in) throws NullArgumentException {
- DataAdapter da = new ArrayDataAdapter(in);
- try {
- da.computeStats();
- // new adapter for the second pass
- fillBinStats(new ArrayDataAdapter(in));
- } catch (IOException ex) {
- // Can't happen
- throw new MathInternalError();
- }
- loaded = true;
-
- }
-
- /**
- * Computes the empirical distribution using data read from a URL.
- *
- * The input file must be an ASCII text file containing one
- * valid numeric entry per line.
- *
- * @param url url of the input file
- *
- * @throws IOException if an IO error occurs
- * @throws NullArgumentException if url is null
- * @throws ZeroException if URL contains no data
- */
- public void load(URL url) throws IOException, NullArgumentException, ZeroException {
- MathUtils.checkNotNull(url);
- Charset charset = Charset.forName(FILE_CHARSET);
- BufferedReader in =
- new BufferedReader(new InputStreamReader(url.openStream(), charset));
- try {
- DataAdapter da = new StreamDataAdapter(in);
- da.computeStats();
- if (sampleStats.getN() == 0) {
- throw new ZeroException(LocalizedFormats.URL_CONTAINS_NO_DATA, url);
- }
- // new adapter for the second pass
- in = new BufferedReader(new InputStreamReader(url.openStream(), charset));
- fillBinStats(new StreamDataAdapter(in));
- loaded = true;
- } finally {
- try {
- in.close();
- } catch (IOException ex) { //NOPMD
- // ignore
- }
- }
- }
-
- /**
- * Computes the empirical distribution from the input file.
- *
- * The input file must be an ASCII text file containing one
- * valid numeric entry per line.
- *
- * @param file the input file
- * @throws IOException if an IO error occurs
- * @throws NullArgumentException if file is null
- */
- public void load(File file) throws IOException, NullArgumentException {
- MathUtils.checkNotNull(file);
- Charset charset = Charset.forName(FILE_CHARSET);
- InputStream is = new FileInputStream(file);
- BufferedReader in = new BufferedReader(new InputStreamReader(is, charset));
- try {
- DataAdapter da = new StreamDataAdapter(in);
- da.computeStats();
- // new adapter for second pass
- is = new FileInputStream(file);
- in = new BufferedReader(new InputStreamReader(is, charset));
- fillBinStats(new StreamDataAdapter(in));
- loaded = true;
- } finally {
- try {
- in.close();
- } catch (IOException ex) { //NOPMD
- // ignore
- }
- }
- }
-
- /**
- * Provides methods for computing sampleStats
and
- * beanStats
abstracting the source of data.
- */
- private abstract class DataAdapter{
-
- /**
- * Compute bin stats.
- *
- * @throws IOException if an error occurs computing bin stats
- */
- public abstract void computeBinStats() throws IOException;
-
- /**
- * Compute sample statistics.
- *
- * @throws IOException if an error occurs computing sample stats
- */
- public abstract void computeStats() throws IOException;
-
- }
-
- /**
- * DataAdapter
for data provided through some input stream
- */
- private class StreamDataAdapter extends DataAdapter{
-
- /** Input stream providing access to the data */
- private BufferedReader inputStream;
-
- /**
- * Create a StreamDataAdapter from a BufferedReader
- *
- * @param in BufferedReader input stream
- */
- StreamDataAdapter(BufferedReader in){
- super();
- inputStream = in;
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeBinStats() throws IOException {
- String str = null;
- double val = 0.0d;
- while ((str = inputStream.readLine()) != null) {
- val = Double.parseDouble(str);
- SummaryStatistics stats = binStats.get(findBin(val));
- stats.addValue(val);
- }
-
- inputStream.close();
- inputStream = null;
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeStats() throws IOException {
- String str = null;
- double val = 0.0;
- sampleStats = new SummaryStatistics();
- while ((str = inputStream.readLine()) != null) {
- val = Double.parseDouble(str);
- sampleStats.addValue(val);
- }
- inputStream.close();
- inputStream = null;
- }
- }
-
- /**
- * DataAdapter
for data provided as array of doubles.
- */
- private class ArrayDataAdapter extends DataAdapter {
-
- /** Array of input data values */
- private final double[] inputArray;
-
- /**
- * Construct an ArrayDataAdapter from a double[] array
- *
- * @param in double[] array holding the data
- * @throws NullArgumentException if in is null
- */
- ArrayDataAdapter(double[] in) throws NullArgumentException {
- super();
- MathUtils.checkNotNull(in);
- inputArray = in;
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeStats() throws IOException {
- sampleStats = new SummaryStatistics();
- for (int i = 0; i < inputArray.length; i++) {
- sampleStats.addValue(inputArray[i]);
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeBinStats() throws IOException {
- for (int i = 0; i < inputArray.length; i++) {
- SummaryStatistics stats =
- binStats.get(findBin(inputArray[i]));
- stats.addValue(inputArray[i]);
- }
- }
- }
-
- /**
- * Fills binStats array (second pass through data file).
- *
- * @param da object providing access to the data
- * @throws IOException if an IO error occurs
- */
- private void fillBinStats(final DataAdapter da)
- throws IOException {
- // Set up grid
- min = sampleStats.getMin();
- max = sampleStats.getMax();
- delta = (max - min)/binCount;
-
- // Initialize binStats ArrayList
- if (!binStats.isEmpty()) {
- binStats.clear();
- }
- for (int i = 0; i < binCount; i++) {
- SummaryStatistics stats = new SummaryStatistics();
- binStats.add(i,stats);
- }
-
- // Filling data in binStats Array
- da.computeBinStats();
-
- // Assign upperBounds based on bin counts
- upperBounds = new double[binCount];
- upperBounds[0] =
- ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
- for (int i = 1; i < binCount-1; i++) {
- upperBounds[i] = upperBounds[i-1] +
- ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
- }
- upperBounds[binCount-1] = 1.0d;
- }
-
- /**
- * Returns the index of the bin to which the given value belongs
- *
- * @param value the value whose bin we are trying to find
- * @return the index of the bin containing the value
- */
- private int findBin(double value) {
- return FastMath.min(
- FastMath.max((int) FastMath.ceil((value - min) / delta) - 1, 0),
- binCount - 1);
- }
-
- /**
- * Returns a {@link StatisticalSummary} describing this distribution.
- * Preconditions:
- * - the distribution must be loaded before invoking this method
- *
- * @return the sample statistics
- * @throws IllegalStateException if the distribution has not been loaded
- */
- public StatisticalSummary getSampleStats() {
- return sampleStats;
- }
-
- /**
- * Returns the number of bins.
- *
- * @return the number of bins.
- */
- public int getBinCount() {
- return binCount;
- }
-
- /**
- * Returns a List of {@link SummaryStatistics} instances containing
- * statistics describing the values in each of the bins. The list is
- * indexed on the bin number.
- *
- * @return List of bin statistics.
- */
- public List getBinStats() {
- return binStats;
- }
-
- /**
- * Returns a fresh copy of the array of upper bounds for the bins.
- * Bins are:
- * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
- * (upperBounds[binCount-2], upperBounds[binCount-1] = max].
- *
- * Note: In versions 1.0-2.0 of commons-math, this method
- * incorrectly returned the array of probability generator upper
- * bounds now returned by {@link #getGeneratorUpperBounds()}.
- *
- * @return array of bin upper bounds
- * @since 2.1
- */
- public double[] getUpperBounds() {
- double[] binUpperBounds = new double[binCount];
- for (int i = 0; i < binCount - 1; i++) {
- binUpperBounds[i] = min + delta * (i + 1);
- }
- binUpperBounds[binCount - 1] = max;
- return binUpperBounds;
- }
-
- /**
- * Returns a fresh copy of the array of upper bounds of the subintervals
- * of [0,1] used in generating data from the empirical distribution.
- * Subintervals correspond to bins with lengths proportional to bin counts.
- *
- * Preconditions:
- * - the distribution must be loaded before invoking this method
- *
- * In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
- * by {@link #getUpperBounds()}.
- *
- * @since 2.1
- * @return array of upper bounds of subintervals used in data generation
- * @throws NullPointerException unless a {@code load} method has been
- * called beforehand.
- */
- public double[] getGeneratorUpperBounds() {
- int len = upperBounds.length;
- double[] out = new double[len];
- System.arraycopy(upperBounds, 0, out, 0, len);
- return out;
- }
-
- /**
- * Property indicating whether or not the distribution has been loaded.
- *
- * @return true if the distribution has been loaded
- */
- public boolean isLoaded() {
- return loaded;
- }
-
- // Distribution methods ---------------------------
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public double probability(double x) {
- return 0;
- }
-
- /**
- * {@inheritDoc}
- *
- * Returns the kernel density normalized so that its integral over each bin
- * equals the bin mass.
- *
- * Algorithm description:
- * - Find the bin B that x belongs to.
- * - Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
- * integral of the kernel density over B).
- * - Return k(x) * P(B) / K(B), where k is the within-bin kernel density
- * and P(B) is the mass of B.
- * @since 3.1
- */
- @Override
- public double density(double x) {
- if (x < min || x > max) {
- return 0d;
- }
- final int binIndex = findBin(x);
- final RealDistribution kernel = getKernel(binStats.get(binIndex));
- return kernel.density(x) * pB(binIndex) / kB(binIndex);
- }
-
- /**
- * {@inheritDoc}
- *
- * Algorithm description:
- * - Find the bin B that x belongs to.
- * - Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.
- * - Compute K(B) = the probability mass of B with respect to the within-bin kernel
- * and K(B-) = the kernel distribution evaluated at the lower endpoint of B
- * - Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
- * K(x) is the within-bin kernel distribution function evaluated at x.
- * If K is a constant distribution, we return P(B-) + P(B) (counting the full
- * mass of B).
- *
- * @since 3.1
- */
- @Override
- public double cumulativeProbability(double x) {
- if (x < min) {
- return 0d;
- } else if (x >= max) {
- return 1d;
- }
- final int binIndex = findBin(x);
- final double pBminus = pBminus(binIndex);
- final double pB = pB(binIndex);
- final RealDistribution kernel = k(x);
- if (kernel instanceof ConstantRealDistribution) {
- if (x < kernel.getNumericalMean()) {
- return pBminus;
- } else {
- return pBminus + pB;
- }
- }
- final double[] binBounds = getUpperBounds();
- final double kB = kB(binIndex);
- final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
- final double withinBinCum =
- (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB;
- return pBminus + pB * withinBinCum;
- }
-
- /**
- * {@inheritDoc}
- *
- * Algorithm description:
- * - Find the smallest i such that the sum of the masses of the bins
- * through i is at least p.
- * -
- * Let K be the within-bin kernel distribution for bin i.
- * Let K(B) be the mass of B under K.
- * Let K(B-) be K evaluated at the lower endpoint of B (the combined
- * mass of the bins below B under K).
- * Let P(B) be the probability of bin i.
- * Let P(B-) be the sum of the bin masses below bin i.
- * Let pCrit = p - P(B-)
- * - Return the inverse of K evaluated at
- * K(B-) + pCrit * K(B) / P(B)
- *
- *
- * @since 3.1
- */
- @Override
- public double inverseCumulativeProbability(final double p) throws OutOfRangeException {
- if (p < 0.0 || p > 1.0) {
- throw new OutOfRangeException(p, 0, 1);
- }
-
- if (p == 0.0) {
- return getSupportLowerBound();
- }
-
- if (p == 1.0) {
- return getSupportUpperBound();
- }
-
- int i = 0;
- while (cumBinP(i) < p) {
- i++;
- }
-
- final RealDistribution kernel = getKernel(binStats.get(i));
- final double kB = kB(i);
- final double[] binBounds = getUpperBounds();
- final double lower = i == 0 ? min : binBounds[i - 1];
- final double kBminus = kernel.cumulativeProbability(lower);
- final double pB = pB(i);
- final double pBminus = pBminus(i);
- final double pCrit = p - pBminus;
- if (pCrit <= 0) {
- return lower;
- }
- return kernel.inverseCumulativeProbability(kBminus + pCrit * kB / pB);
- }
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public double getNumericalMean() {
- return sampleStats.getMean();
- }
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public double getNumericalVariance() {
- return sampleStats.getVariance();
- }
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public double getSupportLowerBound() {
- return min;
- }
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public double getSupportUpperBound() {
- return max;
- }
-
- /**
- * {@inheritDoc}
- * @since 3.1
- */
- @Override
- public boolean isSupportConnected() {
- return true;
- }
-
- /**{@inheritDoc} */
- @Override
- public RealDistribution.Sampler createSampler(final UniformRandomProvider rng) {
- if (!loaded) {
- throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
- }
- return super.createSampler(rng);
- }
-
- /**
- * The probability of bin i.
- *
- * @param i the index of the bin
- * @return the probability that selection begins in bin i
- */
- private double pB(int i) {
- return i == 0 ? upperBounds[0] :
- upperBounds[i] - upperBounds[i - 1];
- }
-
- /**
- * The combined probability of the bins up to but not including bin i.
- *
- * @param i the index of the bin
- * @return the probability that selection begins in a bin below bin i.
- */
- private double pBminus(int i) {
- return i == 0 ? 0 : upperBounds[i - 1];
- }
-
- /**
- * Mass of bin i under the within-bin kernel of the bin.
- *
- * @param i index of the bin
- * @return the difference in the within-bin kernel cdf between the
- * upper and lower endpoints of bin i
- */
- private double kB(int i) {
- final double[] binBounds = getUpperBounds();
- final RealDistribution kernel = getKernel(binStats.get(i));
- return i == 0 ? kernel.probability(min, binBounds[0]) :
- kernel.probability(binBounds[i - 1], binBounds[i]);
- }
-
- /**
- * The within-bin kernel of the bin that x belongs to.
- *
- * @param x the value to locate within a bin
- * @return the within-bin kernel of the bin containing x
- */
- private RealDistribution k(double x) {
- final int binIndex = findBin(x);
- return getKernel(binStats.get(binIndex));
- }
-
- /**
- * The combined probability of the bins up to and including binIndex.
- *
- * @param binIndex maximum bin index
- * @return sum of the probabilities of bins through binIndex
- */
- private double cumBinP(int binIndex) {
- return upperBounds[binIndex];
- }
-
- /**
- * The within-bin smoothing kernel. Returns a Gaussian distribution
- * parameterized by {@code bStats}, unless the bin contains only one
- * observation, in which case a constant distribution is returned.
- *
- * @param bStats summary statistics for the bin
- * @return within-bin kernel parameterized by bStats
- */
- protected RealDistribution getKernel(SummaryStatistics bStats) {
- if (bStats.getN() == 1 || bStats.getVariance() == 0) {
- return new ConstantRealDistribution(bStats.getMean());
- } else {
- return new NormalDistribution(bStats.getMean(), bStats.getStandardDeviation(),
- NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/commons-math/blob/ccba0cfc/src/test/java/org/apache/commons/math4/distribution/EmpiricalDistributionTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/math4/distribution/EmpiricalDistributionTest.java b/src/test/java/org/apache/commons/math4/distribution/EmpiricalDistributionTest.java
new file mode 100644
index 0000000..1251ed3
--- /dev/null
+++ b/src/test/java/org/apache/commons/math4/distribution/EmpiricalDistributionTest.java
@@ -0,0 +1,556 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.math4.distribution;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.commons.math4.TestUtils;
+import org.apache.commons.math4.analysis.UnivariateFunction;
+import org.apache.commons.math4.analysis.integration.BaseAbstractUnivariateIntegrator;
+import org.apache.commons.math4.analysis.integration.IterativeLegendreGaussIntegrator;
+import org.apache.commons.math4.exception.MathIllegalStateException;
+import org.apache.commons.math4.exception.NullArgumentException;
+import org.apache.commons.math4.exception.NotStrictlyPositiveException;
+import org.apache.commons.math4.rng.RandomSource;
+import org.apache.commons.math4.stat.descriptive.SummaryStatistics;
+import org.apache.commons.math4.util.FastMath;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for the {@link EmpiricalDistribution} class.
+ */
+public final class EmpiricalDistributionTest extends RealDistributionAbstractTest {
+
+ protected EmpiricalDistribution empiricalDistribution = null;
+ protected EmpiricalDistribution empiricalDistribution2 = null;
+ protected File file = null;
+ protected URL url = null;
+ protected double[] dataArray = null;
+ protected final int n = 10000;
+
+ @Override
+ @Before
+ public void setUp() {
+ super.setUp();
+ empiricalDistribution = new EmpiricalDistribution(100);
+ url = getClass().getResource("testData.txt");
+ final ArrayList list = new ArrayList();
+ try {
+ empiricalDistribution2 = new EmpiricalDistribution(100);
+ BufferedReader in =
+ new BufferedReader(new InputStreamReader(
+ url.openStream()));
+ String str = null;
+ while ((str = in.readLine()) != null) {
+ list.add(Double.valueOf(str));
+ }
+ in.close();
+ in = null;
+ } catch (IOException ex) {
+ Assert.fail("IOException " + ex);
+ }
+
+ dataArray = new double[list.size()];
+ int i = 0;
+ for (Double data : list) {
+ dataArray[i] = data.doubleValue();
+ i++;
+ }
+ }
+
+ // MATH-1279
+ @Test(expected=NotStrictlyPositiveException.class)
+ public void testPrecondition1() {
+ new EmpiricalDistribution(0);
+ }
+
+ /**
+ * Test EmpiricalDistrbution.load() using sample data file.
+ * Check that the sampleCount, mu and sigma match data in
+ * the sample data file. Also verify that load is idempotent.
+ */
+ @Test
+ public void testLoad() throws Exception {
+ // Load from a URL
+ empiricalDistribution.load(url);
+ checkDistribution();
+
+ // Load again from a file (also verifies idempotency of load)
+ File file = new File(url.toURI());
+ empiricalDistribution.load(file);
+ checkDistribution();
+ }
+
+ private void checkDistribution() {
+ // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1
+ // Make sure that loaded distribution matches this
+ Assert.assertEquals(empiricalDistribution.getSampleStats().getN(),1000,10E-7);
+ //TODO: replace with statistical tests
+ Assert.assertEquals(empiricalDistribution.getSampleStats().getMean(),
+ 5.069831575018909,10E-7);
+ Assert.assertEquals(empiricalDistribution.getSampleStats().getStandardDeviation(),
+ 1.0173699343977738,10E-7);
+ }
+
+ /**
+ * Test EmpiricalDistrbution.load(double[]) using data taken from
+ * sample data file.
+ * Check that the sampleCount, mu and sigma match data in
+ * the sample data file.
+ */
+ @Test
+ public void testDoubleLoad() throws Exception {
+ empiricalDistribution2.load(dataArray);
+ // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1
+ // Make sure that loaded distribution matches this
+ Assert.assertEquals(empiricalDistribution2.getSampleStats().getN(),1000,10E-7);
+ //TODO: replace with statistical tests
+ Assert.assertEquals(empiricalDistribution2.getSampleStats().getMean(),
+ 5.069831575018909,10E-7);
+ Assert.assertEquals(empiricalDistribution2.getSampleStats().getStandardDeviation(),
+ 1.0173699343977738,10E-7);
+
+ double[] bounds = empiricalDistribution2.getGeneratorUpperBounds();
+ Assert.assertEquals(bounds.length, 100);
+ Assert.assertEquals(bounds[99], 1.0, 10e-12);
+
+ }
+
+ /**
+ * Generate 1000 random values and make sure they look OK.
+ * Note that there is a non-zero (but very small) probability that
+ * these tests will fail even if the code is working as designed.
+ */
+ @Test
+ public void testNext() throws Exception {
+ tstGen(0.1);
+ tstDoubleGen(0.1);
+ }
+
+ /**
+ * Make sure exception thrown if sampling is attempted
+ * before loading empiricalDistribution.
+ */
+ @Test
+ public void testNextFail1() {
+ try {
+ empiricalDistribution.createSampler(RandomSource.create(RandomSource.JDK)).sample();
+ Assert.fail("Expecting MathIllegalStateException");
+ } catch (MathIllegalStateException ex) {
+ // expected
+ }
+ }
+
+ /**
+ * Make sure exception thrown if sampling is attempted
+ * before loading empiricalDistribution.
+ */
+ @Test
+ public void testNextFail2() {
+ try {
+ empiricalDistribution2.createSampler(RandomSource.create(RandomSource.JDK)).sample();
+ Assert.fail("Expecting MathIllegalStateException");
+ } catch (MathIllegalStateException ex) {
+ // expected
+ }
+ }
+
+ /**
+ * Make sure we can handle a grid size that is too fine
+ */
+ @Test
+ public void testGridTooFine() throws Exception {
+ empiricalDistribution = new EmpiricalDistribution(1001);
+ tstGen(0.1);
+ empiricalDistribution2 = new EmpiricalDistribution(1001);
+ tstDoubleGen(0.1);
+ }
+
+ /**
+ * How about too fat?
+ */
+ @Test
+ public void testGridTooFat() throws Exception {
+ empiricalDistribution = new EmpiricalDistribution(1);
+ tstGen(5); // ridiculous tolerance; but ridiculous grid size
+ // really just checking to make sure we do not bomb
+ empiricalDistribution2 = new EmpiricalDistribution(1);
+ tstDoubleGen(5);
+ }
+
+ /**
+ * Test bin index overflow problem (BZ 36450)
+ */
+ @Test
+ public void testBinIndexOverflow() throws Exception {
+ double[] x = new double[] {9474.94326071674, 2080107.8865462579};
+ new EmpiricalDistribution().load(x);
+ }
+
+ @Test
+ public void testSerialization() {
+ // Empty
+ EmpiricalDistribution dist = new EmpiricalDistribution();
+ EmpiricalDistribution dist2 = (EmpiricalDistribution) TestUtils.serializeAndRecover(dist);
+ verifySame(dist, dist2);
+
+ // Loaded
+ empiricalDistribution2.load(dataArray);
+ dist2 = (EmpiricalDistribution) TestUtils.serializeAndRecover(empiricalDistribution2);
+ verifySame(empiricalDistribution2, dist2);
+ }
+
+ @Test(expected=NullArgumentException.class)
+ public void testLoadNullDoubleArray() {
+ new EmpiricalDistribution().load((double[]) null);
+ }
+
+ @Test(expected=NullArgumentException.class)
+ public void testLoadNullURL() throws Exception {
+ new EmpiricalDistribution().load((URL) null);
+ }
+
+ @Test(expected=NullArgumentException.class)
+ public void testLoadNullFile() throws Exception {
+ new EmpiricalDistribution().load((File) null);
+ }
+
+ /**
+ * MATH-298
+ */
+ @Test
+ public void testGetBinUpperBounds() {
+ double[] testData = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10};
+ EmpiricalDistribution dist = new EmpiricalDistribution(5);
+ dist.load(testData);
+ double[] expectedBinUpperBounds = {2, 4, 6, 8, 10};
+ double[] expectedGeneratorUpperBounds = {4d/13d, 7d/13d, 9d/13d, 11d/13d, 1};
+ double tol = 10E-12;
+ TestUtils.assertEquals(expectedBinUpperBounds, dist.getUpperBounds(), tol);
+ TestUtils.assertEquals(expectedGeneratorUpperBounds, dist.getGeneratorUpperBounds(), tol);
+ }
+
+ private void verifySame(EmpiricalDistribution d1, EmpiricalDistribution d2) {
+ Assert.assertEquals(d1.isLoaded(), d2.isLoaded());
+ Assert.assertEquals(d1.getBinCount(), d2.getBinCount());
+ Assert.assertEquals(d1.getSampleStats(), d2.getSampleStats());
+ if (d1.isLoaded()) {
+ for (int i = 0; i < d1.getUpperBounds().length; i++) {
+ Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0);
+ }
+ Assert.assertEquals(d1.getBinStats(), d2.getBinStats());
+ }
+ }
+
+ private void tstGen(double tolerance)throws Exception {
+ empiricalDistribution.load(url);
+ RealDistribution.Sampler sampler
+ = empiricalDistribution.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ SummaryStatistics stats = new SummaryStatistics();
+ for (int i = 1; i < 1000; i++) {
+ stats.addValue(sampler.sample());
+ }
+ Assert.assertEquals("mean", 5.069831575018909, stats.getMean(),tolerance);
+ Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(),tolerance);
+ }
+
+ private void tstDoubleGen(double tolerance)throws Exception {
+ empiricalDistribution2.load(dataArray);
+ RealDistribution.Sampler sampler
+ = empiricalDistribution2.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ SummaryStatistics stats = new SummaryStatistics();
+ for (int i = 1; i < 1000; i++) {
+ stats.addValue(sampler.sample());
+ }
+ Assert.assertEquals("mean", 5.069831575018909, stats.getMean(), tolerance);
+ Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(), tolerance);
+ }
+
+ // Setup for distribution tests
+
+ @Override
+ public RealDistribution makeDistribution() {
+ // Create a uniform distribution on [0, 10,000]
+ final double[] sourceData = new double[n + 1];
+ for (int i = 0; i < n + 1; i++) {
+ sourceData[i] = i;
+ }
+ EmpiricalDistribution dist = new EmpiricalDistribution();
+ dist.load(sourceData);
+ return dist;
+ }
+
+ /** Uniform bin mass = 10/10001 == mass of all but the first bin */
+ private final double binMass = 10d / (n + 1);
+
+ /** Mass of first bin = 11/10001 */
+ private final double firstBinMass = 11d / (n + 1);
+
+ @Override
+ public double[] makeCumulativeTestPoints() {
+ final double[] testPoints = new double[] {9, 10, 15, 1000, 5004, 9999};
+ return testPoints;
+ }
+
+
+ @Override
+ public double[] makeCumulativeTestValues() {
+ /*
+ * Bins should be [0, 10], (10, 20], ..., (9990, 10000]
+ * Kernels should be N(4.5, 3.02765), N(14.5, 3.02765)...
+ * Each bin should have mass 10/10000 = .001
+ */
+ final double[] testPoints = getCumulativeTestPoints();
+ final double[] cumValues = new double[testPoints.length];
+ final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution();
+ final double[] binBounds = empiricalDistribution.getUpperBounds();
+ for (int i = 0; i < testPoints.length; i++) {
+ final int bin = findBin(testPoints[i]);
+ final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() :
+ binBounds[bin - 1];
+ final double upper = binBounds[bin];
+ // Compute bMinus = sum or mass of bins below the bin containing the point
+ // First bin has mass 11 / 10000, the rest have mass 10 / 10000.
+ final double bMinus = bin == 0 ? 0 : (bin - 1) * binMass + firstBinMass;
+ final RealDistribution kernel = findKernel(lower, upper);
+ final double withinBinKernelMass = kernel.probability(lower, upper);
+ final double kernelCum = kernel.probability(lower, testPoints[i]);
+ cumValues[i] = bMinus + (bin == 0 ? firstBinMass : binMass) * kernelCum/withinBinKernelMass;
+ }
+ return cumValues;
+ }
+
+ @Override
+ public double[] makeDensityTestValues() {
+ final double[] testPoints = getCumulativeTestPoints();
+ final double[] densityValues = new double[testPoints.length];
+ final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution();
+ final double[] binBounds = empiricalDistribution.getUpperBounds();
+ for (int i = 0; i < testPoints.length; i++) {
+ final int bin = findBin(testPoints[i]);
+ final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() :
+ binBounds[bin - 1];
+ final double upper = binBounds[bin];
+ final RealDistribution kernel = findKernel(lower, upper);
+ final double withinBinKernelMass = kernel.probability(lower, upper);
+ final double density = kernel.density(testPoints[i]);
+ densityValues[i] = density * (bin == 0 ? firstBinMass : binMass) / withinBinKernelMass;
+ }
+ return densityValues;
+ }
+
+ /**
+ * Modify test integration bounds from the default. Because the distribution
+ * has discontinuities at bin boundaries, integrals spanning multiple bins
+ * will face convergence problems. Only test within-bin integrals and spans
+ * across no more than 3 bin boundaries.
+ */
+ @Override
+ @Test
+ public void testDensityIntegrals() {
+ final RealDistribution distribution = makeDistribution();
+ final double tol = 1.0e-9;
+ final BaseAbstractUnivariateIntegrator integrator =
+ new IterativeLegendreGaussIntegrator(5, 1.0e-12, 1.0e-10);
+ final UnivariateFunction d = new UnivariateFunction() {
+ @Override
+ public double value(double x) {
+ return distribution.density(x);
+ }
+ };
+ final double[] lower = {0, 5, 1000, 5001, 9995};
+ final double[] upper = {5, 12, 1030, 5010, 10000};
+ for (int i = 1; i < 5; i++) {
+ Assert.assertEquals(
+ distribution.probability(
+ lower[i], upper[i]),
+ integrator.integrate(
+ 1000000, // Triangle integrals are very slow to converge
+ d, lower[i], upper[i]), tol);
+ }
+ }
+
+ /**
+ * MATH-984
+ * Verify that sampled values do not go outside of the range of the data.
+ */
+ @Test
+ public void testSampleValuesRange() {
+ // Concentrate values near the endpoints of (0, 1).
+ // Unconstrained Gaussian kernel would generate values outside the interval.
+ final double[] data = new double[100];
+ for (int i = 0; i < 50; i++) {
+ data[i] = 1 / ((double) i + 1);
+ }
+ for (int i = 51; i < 100; i++) {
+ data[i] = 1 - 1 / (100 - (double) i + 2);
+ }
+ EmpiricalDistribution dist = new EmpiricalDistribution(10);
+ dist.load(data);
+ RealDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ for (int i = 0; i < 1000; i++) {
+ final double dev = sampler.sample();
+ Assert.assertTrue(dev < 1);
+ Assert.assertTrue(dev > 0);
+ }
+ }
+
+ /**
+ * MATH-1203, MATH-1208
+ */
+ @Test
+ public void testNoBinVariance() {
+ final double[] data = {0, 0, 1, 1};
+ EmpiricalDistribution dist = new EmpiricalDistribution(2);
+ dist.load(data);
+ RealDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ for (int i = 0; i < 1000; i++) {
+ final double dev = sampler.sample();
+ Assert.assertTrue(dev == 0 || dev == 1);
+ }
+ Assert.assertEquals(0.5, dist.cumulativeProbability(0), Double.MIN_VALUE);
+ Assert.assertEquals(1.0, dist.cumulativeProbability(1), Double.MIN_VALUE);
+ Assert.assertEquals(0.5, dist.cumulativeProbability(0.5), Double.MIN_VALUE);
+ Assert.assertEquals(0.5, dist.cumulativeProbability(0.7), Double.MIN_VALUE);
+ }
+
+ /**
+ * Find the bin that x belongs (relative to {@link #makeDistribution()}).
+ */
+ private int findBin(double x) {
+ // Number of bins below x should be trunc(x/10)
+ final double nMinus = FastMath.floor(x / 10);
+ final int bin = (int) FastMath.round(nMinus);
+ // If x falls on a bin boundary, it is in the lower bin
+ return FastMath.floor(x / 10) == x / 10 ? bin - 1 : bin;
+ }
+
+ /**
+ * Find the within-bin kernel for the bin with lower bound lower
+ * and upper bound upper. All bins other than the first contain 10 points
+ * exclusive of the lower bound and are centered at (lower + upper + 1) / 2.
+ * The first bin includes its lower bound, 0, so has different mean and
+ * standard deviation.
+ */
+ private RealDistribution findKernel(double lower, double upper) {
+ if (lower < 1) {
+ return new NormalDistribution(5d, 3.3166247903554);
+ } else {
+ return new NormalDistribution((upper + lower + 1) / 2d, 3.0276503540974917);
+ }
+ }
+
+ @Test
+ public void testKernelOverrideConstant() {
+ final EmpiricalDistribution dist = new ConstantKernelEmpiricalDistribution(5);
+ final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d};
+ dist.load(data);
+ RealDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ // Bin masses concentrated on 2, 5, 8, 11, 14 <- effectively discrete uniform distribution over these
+ double[] values = {2d, 5d, 8d, 11d, 14d};
+ for (int i = 0; i < 20; i++) {
+ Assert.assertTrue(Arrays.binarySearch(values, sampler.sample()) >= 0);
+ }
+ final double tol = 10E-12;
+ Assert.assertEquals(0.0, dist.cumulativeProbability(1), tol);
+ Assert.assertEquals(0.2, dist.cumulativeProbability(2), tol);
+ Assert.assertEquals(0.6, dist.cumulativeProbability(10), tol);
+ Assert.assertEquals(0.8, dist.cumulativeProbability(12), tol);
+ Assert.assertEquals(0.8, dist.cumulativeProbability(13), tol);
+ Assert.assertEquals(1.0, dist.cumulativeProbability(15), tol);
+
+ Assert.assertEquals(2.0, dist.inverseCumulativeProbability(0.1), tol);
+ Assert.assertEquals(2.0, dist.inverseCumulativeProbability(0.2), tol);
+ Assert.assertEquals(5.0, dist.inverseCumulativeProbability(0.3), tol);
+ Assert.assertEquals(5.0, dist.inverseCumulativeProbability(0.4), tol);
+ Assert.assertEquals(8.0, dist.inverseCumulativeProbability(0.5), tol);
+ Assert.assertEquals(8.0, dist.inverseCumulativeProbability(0.6), tol);
+ }
+
+ @Test
+ public void testKernelOverrideUniform() {
+ final EmpiricalDistribution dist = new UniformKernelEmpiricalDistribution(5);
+ final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d};
+ dist.load(data);
+ RealDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ // Kernels are uniform distributions on [1,3], [4,6], [7,9], [10,12], [13,15]
+ final double bounds[] = {3d, 6d, 9d, 12d};
+ final double tol = 10E-12;
+ for (int i = 0; i < 20; i++) {
+ final double v = sampler.sample();
+ // Make sure v is not in the excluded range between bins - that is (bounds[i], bounds[i] + 1)
+ for (int j = 0; j < bounds.length; j++) {
+ Assert.assertFalse(v > bounds[j] + tol && v < bounds[j] + 1 - tol);
+ }
+ }
+ Assert.assertEquals(0.0, dist.cumulativeProbability(1), tol);
+ Assert.assertEquals(0.1, dist.cumulativeProbability(2), tol);
+ Assert.assertEquals(0.6, dist.cumulativeProbability(10), tol);
+ Assert.assertEquals(0.8, dist.cumulativeProbability(12), tol);
+ Assert.assertEquals(0.8, dist.cumulativeProbability(13), tol);
+ Assert.assertEquals(1.0, dist.cumulativeProbability(15), tol);
+
+ Assert.assertEquals(2.0, dist.inverseCumulativeProbability(0.1), tol);
+ Assert.assertEquals(3.0, dist.inverseCumulativeProbability(0.2), tol);
+ Assert.assertEquals(5.0, dist.inverseCumulativeProbability(0.3), tol);
+ Assert.assertEquals(6.0, dist.inverseCumulativeProbability(0.4), tol);
+ Assert.assertEquals(8.0, dist.inverseCumulativeProbability(0.5), tol);
+ Assert.assertEquals(9.0, dist.inverseCumulativeProbability(0.6), tol);
+ }
+
+
+ /**
+ * Empirical distribution using a constant smoothing kernel.
+ */
+ private class ConstantKernelEmpiricalDistribution extends EmpiricalDistribution {
+ private static final long serialVersionUID = 1L;
+ public ConstantKernelEmpiricalDistribution(int i) {
+ super(i);
+ }
+ // Use constant distribution equal to bin mean within bin
+ @Override
+ protected RealDistribution getKernel(SummaryStatistics bStats) {
+ return new ConstantRealDistribution(bStats.getMean());
+ }
+ }
+
+ /**
+ * Empirical distribution using a uniform smoothing kernel.
+ */
+ private class UniformKernelEmpiricalDistribution extends EmpiricalDistribution {
+ private static final long serialVersionUID = 2963149194515159653L;
+ public UniformKernelEmpiricalDistribution(int i) {
+ super(i);
+ }
+ @Override
+ protected RealDistribution getKernel(SummaryStatistics bStats) {
+ return new UniformRealDistribution(bStats.getMin(), bStats.getMax());
+ }
+ }
+}