spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From m...@apache.org
Subject spark git commit: [SPARK-5477] refactor stat.py
Date Thu, 29 Jan 2015 18:11:47 GMT
Repository: spark
Updated Branches:
  refs/heads/master 5ad78f620 -> a3dc61848


[SPARK-5477] refactor stat.py

There is only a single `stat.py` file for the `mllib.stat` package. We recently added `MultivariateGaussian`
under `mllib.stat.distribution` in Scala/Java. It would be nice to refactor `stat.py` and
make it easy to expand. Note that `ChiSqTestResult` is moved from `mllib.stat` to `mllib.stat.test`.
The latter is used in Scala/Java. It is only used in the return value of `Statistics.chiSqTest`,
so this should be an okay change.

davies

Author: Xiangrui Meng <meng@databricks.com>

Closes #4266 from mengxr/py-stat-refactor and squashes the following commits:

1a5e1db [Xiangrui Meng] refactor stat.py


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3dc6184
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3dc6184
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3dc6184

Branch: refs/heads/master
Commit: a3dc6184862345c459d1fba475b1c9210038a913
Parents: 5ad78f6
Author: Xiangrui Meng <meng@databricks.com>
Authored: Thu Jan 29 10:11:44 2015 -0800
Committer: Xiangrui Meng <meng@databricks.com>
Committed: Thu Jan 29 10:11:44 2015 -0800

----------------------------------------------------------------------
 mllib/pom.xml                            |   1 +
 python/pyspark/mllib/stat.py             | 298 --------------------------
 python/pyspark/mllib/stat/__init__.py    |  24 +++
 python/pyspark/mllib/stat/_statistics.py | 247 +++++++++++++++++++++
 python/pyspark/mllib/stat/test.py        |  69 ++++++
 python/run-tests                         |   2 +-
 6 files changed, 342 insertions(+), 299 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/mllib/pom.xml
----------------------------------------------------------------------
diff --git a/mllib/pom.xml b/mllib/pom.xml
index 7b7beaf..fc2b2cc 100644
--- a/mllib/pom.xml
+++ b/mllib/pom.xml
@@ -125,6 +125,7 @@
         <directory>../python</directory>
         <includes>
           <include>pyspark/mllib/*.py</include>
+          <include>pyspark/mllib/stat/*.py</include>
           <include>pyspark/ml/*.py</include>
           <include>pyspark/ml/param/*.py</include>
         </includes>

http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/python/pyspark/mllib/stat.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py
deleted file mode 100644
index c8af777..0000000
--- a/python/pyspark/mllib/stat.py
+++ /dev/null
@@ -1,298 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-Python package for statistical functions in MLlib.
-"""
-
-from pyspark import RDD
-from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import Matrix, _convert_to_vector
-from pyspark.mllib.regression import LabeledPoint
-
-
-__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
-
-
-class MultivariateStatisticalSummary(JavaModelWrapper):
-
-    """
-    Trait for multivariate statistical summary of a data matrix.
-    """
-
-    def mean(self):
-        return self.call("mean").toArray()
-
-    def variance(self):
-        return self.call("variance").toArray()
-
-    def count(self):
-        return self.call("count")
-
-    def numNonzeros(self):
-        return self.call("numNonzeros").toArray()
-
-    def max(self):
-        return self.call("max").toArray()
-
-    def min(self):
-        return self.call("min").toArray()
-
-
-class ChiSqTestResult(JavaModelWrapper):
-    """
-    .. note:: Experimental
-
-    Object containing the test results for the chi-squared hypothesis test.
-    """
-    @property
-    def method(self):
-        """
-        Name of the test method
-        """
-        return self._java_model.method()
-
-    @property
-    def pValue(self):
-        """
-        The probability of obtaining a test statistic result at least as
-        extreme as the one that was actually observed, assuming that the
-        null hypothesis is true.
-        """
-        return self._java_model.pValue()
-
-    @property
-    def degreesOfFreedom(self):
-        """
-        Returns the degree(s) of freedom of the hypothesis test.
-        Return type should be Number(e.g. Int, Double) or tuples of Numbers.
-        """
-        return self._java_model.degreesOfFreedom()
-
-    @property
-    def statistic(self):
-        """
-        Test statistic.
-        """
-        return self._java_model.statistic()
-
-    @property
-    def nullHypothesis(self):
-        """
-        Null hypothesis of the test.
-        """
-        return self._java_model.nullHypothesis()
-
-    def __str__(self):
-        return self._java_model.toString()
-
-
-class Statistics(object):
-
-    @staticmethod
-    def colStats(rdd):
-        """
-        Computes column-wise summary statistics for the input RDD[Vector].
-
-        :param rdd: an RDD[Vector] for which column-wise summary statistics
-                    are to be computed.
-        :return: :class:`MultivariateStatisticalSummary` object containing
-                 column-wise summary statistics.
-
-        >>> from pyspark.mllib.linalg import Vectors
-        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
-        ...                       Vectors.dense([4, 5, 0,  3]),
-        ...                       Vectors.dense([6, 7, 0,  8])])
-        >>> cStats = Statistics.colStats(rdd)
-        >>> cStats.mean()
-        array([ 4.,  4.,  0.,  3.])
-        >>> cStats.variance()
-        array([  4.,  13.,   0.,  25.])
-        >>> cStats.count()
-        3L
-        >>> cStats.numNonzeros()
-        array([ 3.,  2.,  0.,  3.])
-        >>> cStats.max()
-        array([ 6.,  7.,  0.,  8.])
-        >>> cStats.min()
-        array([ 2.,  0.,  0., -2.])
-        """
-        cStats = callMLlibFunc("colStats", rdd.map(_convert_to_vector))
-        return MultivariateStatisticalSummary(cStats)
-
-    @staticmethod
-    def corr(x, y=None, method=None):
-        """
-        Compute the correlation (matrix) for the input RDD(s) using the
-        specified method.
-        Methods currently supported: I{pearson (default), spearman}.
-
-        If a single RDD of Vectors is passed in, a correlation matrix
-        comparing the columns in the input RDD is returned. Use C{method=}
-        to specify the method to be used for single RDD inout.
-        If two RDDs of floats are passed in, a single float is returned.
-
-        :param x: an RDD of vector for which the correlation matrix is to be computed,
-                  or an RDD of float of the same cardinality as y when y is specified.
-        :param y: an RDD of float of the same cardinality as x.
-        :param method: String specifying the method to use for computing correlation.
-                       Supported: `pearson` (default), `spearman`
-        :return: Correlation matrix comparing columns in x.
-
-        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
-        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
-        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
-        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
-        True
-        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
-        True
-        >>> Statistics.corr(x, y, "spearman")
-        0.5
-        >>> from math import isnan
-        >>> isnan(Statistics.corr(x, zeros))
-        True
-        >>> from pyspark.mllib.linalg import Vectors
-        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
-        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
-        >>> pearsonCorr = Statistics.corr(rdd)
-        >>> print str(pearsonCorr).replace('nan', 'NaN')
-        [[ 1.          0.05564149         NaN  0.40047142]
-         [ 0.05564149  1.                 NaN  0.91359586]
-         [        NaN         NaN  1.                 NaN]
-         [ 0.40047142  0.91359586         NaN  1.        ]]
-        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
-        >>> print str(spearmanCorr).replace('nan', 'NaN')
-        [[ 1.          0.10540926         NaN  0.4       ]
-         [ 0.10540926  1.                 NaN  0.9486833 ]
-         [        NaN         NaN  1.                 NaN]
-         [ 0.4         0.9486833          NaN  1.        ]]
-        >>> try:
-        ...     Statistics.corr(rdd, "spearman")
-        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
-        ... except TypeError:
-        ...     pass
-        """
-        # Check inputs to determine whether a single value or a matrix is needed for output.
-        # Since it's legal for users to use the method name as the second argument, we need to
-        # check if y is used to specify the method name instead.
-        if type(y) == str:
-            raise TypeError("Use 'method=' to specify method name.")
-
-        if not y:
-            return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray()
-        else:
-            return callMLlibFunc("corr", x.map(float), y.map(float), method)
-
-    @staticmethod
-    def chiSqTest(observed, expected=None):
-        """
-        .. note:: Experimental
-
-        If `observed` is Vector, conduct Pearson's chi-squared goodness
-        of fit test of the observed data against the expected distribution,
-        or againt the uniform distribution (by default), with each category
-        having an expected frequency of `1 / len(observed)`.
-        (Note: `observed` cannot contain negative values)
-
-        If `observed` is matrix, conduct Pearson's independence test on the
-        input contingency matrix, which cannot contain negative entries or
-        columns or rows that sum up to 0.
-
-        If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
-        test for every feature against the label across the input RDD.
-        For each feature, the (feature, label) pairs are converted into a
-        contingency matrix for which the chi-squared statistic is computed.
-        All label and feature values must be categorical.
-
-        :param observed: it could be a vector containing the observed categorical
-                         counts/relative frequencies, or the contingency matrix
-                         (containing either counts or relative frequencies),
-                         or an RDD of LabeledPoint containing the labeled dataset
-                         with categorical features. Real-valued features will be
-                         treated as categorical for each distinct value.
-        :param expected: Vector containing the expected categorical counts/relative
-                         frequencies. `expected` is rescaled if the `expected` sum
-                         differs from the `observed` sum.
-        :return: ChiSquaredTest object containing the test statistic, degrees
-                 of freedom, p-value, the method used, and the null hypothesis.
-
-        >>> from pyspark.mllib.linalg import Vectors, Matrices
-        >>> observed = Vectors.dense([4, 6, 5])
-        >>> pearson = Statistics.chiSqTest(observed)
-        >>> print pearson.statistic
-        0.4
-        >>> pearson.degreesOfFreedom
-        2
-        >>> print round(pearson.pValue, 4)
-        0.8187
-        >>> pearson.method
-        u'pearson'
-        >>> pearson.nullHypothesis
-        u'observed follows the same distribution as expected.'
-
-        >>> observed = Vectors.dense([21, 38, 43, 80])
-        >>> expected = Vectors.dense([3, 5, 7, 20])
-        >>> pearson = Statistics.chiSqTest(observed, expected)
-        >>> print round(pearson.pValue, 4)
-        0.0027
-
-        >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
-        >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
-        >>> print round(chi.statistic, 4)
-        21.9958
-
-        >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
-        ...         LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
-        ...         LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
-        ...         LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
-        ...         LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
-        ...         LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
-        >>> rdd = sc.parallelize(data, 4)
-        >>> chi = Statistics.chiSqTest(rdd)
-        >>> print chi[0].statistic
-        0.75
-        >>> print chi[1].statistic
-        1.5
-        """
-        if isinstance(observed, RDD):
-            if not isinstance(observed.first(), LabeledPoint):
-                raise ValueError("observed should be an RDD of LabeledPoint")
-            jmodels = callMLlibFunc("chiSqTest", observed)
-            return [ChiSqTestResult(m) for m in jmodels]
-
-        if isinstance(observed, Matrix):
-            jmodel = callMLlibFunc("chiSqTest", observed)
-        else:
-            if expected and len(expected) != len(observed):
-                raise ValueError("`expected` should have same length with `observed`")
-            jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
-        return ChiSqTestResult(jmodel)
-
-
-def _test():
-    import doctest
-    from pyspark import SparkContext
-    globs = globals().copy()
-    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
-    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
-    globs['sc'].stop()
-    if failure_count:
-        exit(-1)
-
-
-if __name__ == "__main__":
-    _test()

http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/python/pyspark/mllib/stat/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
new file mode 100644
index 0000000..799d260
--- /dev/null
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Python package for statistical functions in MLlib.
+"""
+
+from pyspark.mllib.stat._statistics import *
+
+__all__ = ["Statistics", "MultivariateStatisticalSummary"]

http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/python/pyspark/mllib/stat/_statistics.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
new file mode 100644
index 0000000..218ac14
--- /dev/null
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -0,0 +1,247 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark import RDD
+from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
+from pyspark.mllib.linalg import Matrix, _convert_to_vector
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat.test import ChiSqTestResult
+
+
+__all__ = ['MultivariateStatisticalSummary', 'Statistics']
+
+
+class MultivariateStatisticalSummary(JavaModelWrapper):
+
+    """
+    Trait for multivariate statistical summary of a data matrix.
+    """
+
+    def mean(self):
+        return self.call("mean").toArray()
+
+    def variance(self):
+        return self.call("variance").toArray()
+
+    def count(self):
+        return self.call("count")
+
+    def numNonzeros(self):
+        return self.call("numNonzeros").toArray()
+
+    def max(self):
+        return self.call("max").toArray()
+
+    def min(self):
+        return self.call("min").toArray()
+
+
+class Statistics(object):
+
+    @staticmethod
+    def colStats(rdd):
+        """
+        Computes column-wise summary statistics for the input RDD[Vector].
+
+        :param rdd: an RDD[Vector] for which column-wise summary statistics
+                    are to be computed.
+        :return: :class:`MultivariateStatisticalSummary` object containing
+                 column-wise summary statistics.
+
+        >>> from pyspark.mllib.linalg import Vectors
+        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
+        ...                       Vectors.dense([4, 5, 0,  3]),
+        ...                       Vectors.dense([6, 7, 0,  8])])
+        >>> cStats = Statistics.colStats(rdd)
+        >>> cStats.mean()
+        array([ 4.,  4.,  0.,  3.])
+        >>> cStats.variance()
+        array([  4.,  13.,   0.,  25.])
+        >>> cStats.count()
+        3L
+        >>> cStats.numNonzeros()
+        array([ 3.,  2.,  0.,  3.])
+        >>> cStats.max()
+        array([ 6.,  7.,  0.,  8.])
+        >>> cStats.min()
+        array([ 2.,  0.,  0., -2.])
+        """
+        cStats = callMLlibFunc("colStats", rdd.map(_convert_to_vector))
+        return MultivariateStatisticalSummary(cStats)
+
+    @staticmethod
+    def corr(x, y=None, method=None):
+        """
+        Compute the correlation (matrix) for the input RDD(s) using the
+        specified method.
+        Methods currently supported: I{pearson (default), spearman}.
+
+        If a single RDD of Vectors is passed in, a correlation matrix
+        comparing the columns in the input RDD is returned. Use C{method=}
+        to specify the method to be used for single RDD inout.
+        If two RDDs of floats are passed in, a single float is returned.
+
+        :param x: an RDD of vector for which the correlation matrix is to be computed,
+                  or an RDD of float of the same cardinality as y when y is specified.
+        :param y: an RDD of float of the same cardinality as x.
+        :param method: String specifying the method to use for computing correlation.
+                       Supported: `pearson` (default), `spearman`
+        :return: Correlation matrix comparing columns in x.
+
+        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
+        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
+        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
+        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
+        True
+        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
+        True
+        >>> Statistics.corr(x, y, "spearman")
+        0.5
+        >>> from math import isnan
+        >>> isnan(Statistics.corr(x, zeros))
+        True
+        >>> from pyspark.mllib.linalg import Vectors
+        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
+        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
+        >>> pearsonCorr = Statistics.corr(rdd)
+        >>> print str(pearsonCorr).replace('nan', 'NaN')
+        [[ 1.          0.05564149         NaN  0.40047142]
+         [ 0.05564149  1.                 NaN  0.91359586]
+         [        NaN         NaN  1.                 NaN]
+         [ 0.40047142  0.91359586         NaN  1.        ]]
+        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
+        >>> print str(spearmanCorr).replace('nan', 'NaN')
+        [[ 1.          0.10540926         NaN  0.4       ]
+         [ 0.10540926  1.                 NaN  0.9486833 ]
+         [        NaN         NaN  1.                 NaN]
+         [ 0.4         0.9486833          NaN  1.        ]]
+        >>> try:
+        ...     Statistics.corr(rdd, "spearman")
+        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
+        ... except TypeError:
+        ...     pass
+        """
+        # Check inputs to determine whether a single value or a matrix is needed for output.
+        # Since it's legal for users to use the method name as the second argument, we need to
+        # check if y is used to specify the method name instead.
+        if type(y) == str:
+            raise TypeError("Use 'method=' to specify method name.")
+
+        if not y:
+            return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray()
+        else:
+            return callMLlibFunc("corr", x.map(float), y.map(float), method)
+
+    @staticmethod
+    def chiSqTest(observed, expected=None):
+        """
+        .. note:: Experimental
+
+        If `observed` is Vector, conduct Pearson's chi-squared goodness
+        of fit test of the observed data against the expected distribution,
+        or againt the uniform distribution (by default), with each category
+        having an expected frequency of `1 / len(observed)`.
+        (Note: `observed` cannot contain negative values)
+
+        If `observed` is matrix, conduct Pearson's independence test on the
+        input contingency matrix, which cannot contain negative entries or
+        columns or rows that sum up to 0.
+
+        If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
+        test for every feature against the label across the input RDD.
+        For each feature, the (feature, label) pairs are converted into a
+        contingency matrix for which the chi-squared statistic is computed.
+        All label and feature values must be categorical.
+
+        :param observed: it could be a vector containing the observed categorical
+                         counts/relative frequencies, or the contingency matrix
+                         (containing either counts or relative frequencies),
+                         or an RDD of LabeledPoint containing the labeled dataset
+                         with categorical features. Real-valued features will be
+                         treated as categorical for each distinct value.
+        :param expected: Vector containing the expected categorical counts/relative
+                         frequencies. `expected` is rescaled if the `expected` sum
+                         differs from the `observed` sum.
+        :return: ChiSquaredTest object containing the test statistic, degrees
+                 of freedom, p-value, the method used, and the null hypothesis.
+
+        >>> from pyspark.mllib.linalg import Vectors, Matrices
+        >>> observed = Vectors.dense([4, 6, 5])
+        >>> pearson = Statistics.chiSqTest(observed)
+        >>> print pearson.statistic
+        0.4
+        >>> pearson.degreesOfFreedom
+        2
+        >>> print round(pearson.pValue, 4)
+        0.8187
+        >>> pearson.method
+        u'pearson'
+        >>> pearson.nullHypothesis
+        u'observed follows the same distribution as expected.'
+
+        >>> observed = Vectors.dense([21, 38, 43, 80])
+        >>> expected = Vectors.dense([3, 5, 7, 20])
+        >>> pearson = Statistics.chiSqTest(observed, expected)
+        >>> print round(pearson.pValue, 4)
+        0.0027
+
+        >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
+        >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
+        >>> print round(chi.statistic, 4)
+        21.9958
+
+        >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
+        ...         LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
+        ...         LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
+        ...         LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
+        ...         LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
+        ...         LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
+        >>> rdd = sc.parallelize(data, 4)
+        >>> chi = Statistics.chiSqTest(rdd)
+        >>> print chi[0].statistic
+        0.75
+        >>> print chi[1].statistic
+        1.5
+        """
+        if isinstance(observed, RDD):
+            if not isinstance(observed.first(), LabeledPoint):
+                raise ValueError("observed should be an RDD of LabeledPoint")
+            jmodels = callMLlibFunc("chiSqTest", observed)
+            return [ChiSqTestResult(m) for m in jmodels]
+
+        if isinstance(observed, Matrix):
+            jmodel = callMLlibFunc("chiSqTest", observed)
+        else:
+            if expected and len(expected) != len(observed):
+                raise ValueError("`expected` should have same length with `observed`")
+            jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
+        return ChiSqTestResult(jmodel)
+
+
+def _test():
+    import doctest
+    from pyspark import SparkContext
+    globs = globals().copy()
+    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+
+if __name__ == "__main__":
+    _test()

http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/python/pyspark/mllib/stat/test.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
new file mode 100644
index 0000000..762506e
--- /dev/null
+++ b/python/pyspark/mllib/stat/test.py
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.mllib.common import JavaModelWrapper
+
+
+__all__ = ["ChiSqTestResult"]
+
+
+class ChiSqTestResult(JavaModelWrapper):
+    """
+    .. note:: Experimental
+
+    Object containing the test results for the chi-squared hypothesis test.
+    """
+    @property
+    def method(self):
+        """
+        Name of the test method
+        """
+        return self._java_model.method()
+
+    @property
+    def pValue(self):
+        """
+        The probability of obtaining a test statistic result at least as
+        extreme as the one that was actually observed, assuming that the
+        null hypothesis is true.
+        """
+        return self._java_model.pValue()
+
+    @property
+    def degreesOfFreedom(self):
+        """
+        Returns the degree(s) of freedom of the hypothesis test.
+        Return type should be Number(e.g. Int, Double) or tuples of Numbers.
+        """
+        return self._java_model.degreesOfFreedom()
+
+    @property
+    def statistic(self):
+        """
+        Test statistic.
+        """
+        return self._java_model.statistic()
+
+    @property
+    def nullHypothesis(self):
+        """
+        Null hypothesis of the test.
+        """
+        return self._java_model.nullHypothesis()
+
+    def __str__(self):
+        return self._java_model.toString()

http://git-wip-us.apache.org/repos/asf/spark/blob/a3dc6184/python/run-tests
----------------------------------------------------------------------
diff --git a/python/run-tests b/python/run-tests
index 84cb89b..e91f1a8 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -76,7 +76,7 @@ function run_mllib_tests() {
     run_test "pyspark/mllib/rand.py"
     run_test "pyspark/mllib/recommendation.py"
     run_test "pyspark/mllib/regression.py"
-    run_test "pyspark/mllib/stat.py"
+    run_test "pyspark/mllib/stat/_statistics.py"
     run_test "pyspark/mllib/tree.py"
     run_test "pyspark/mllib/util.py"
     run_test "pyspark/mllib/tests.py"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message