spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From m...@apache.org
Subject spark git commit: [SPARK-6598][MLLIB] Python API for IDFModel
Date Tue, 31 Mar 2015 18:25:23 GMT
Repository: spark
Updated Branches:
  refs/heads/master cd48ca501 -> 46de6c05e


[SPARK-6598][MLLIB] Python API for IDFModel

This is a sub-task of SPARK-6254.
It wraps the IDFModel `idf` member function for PySpark.

Author: lewuathe <lewuathe@me.com>

Closes #5264 from Lewuathe/SPARK-6598 and squashes the following commits:

1dc522c [lewuathe] [SPARK-6598] Python API for IDFModel


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/46de6c05
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/46de6c05
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/46de6c05

Branch: refs/heads/master
Commit: 46de6c05e0619250346f0988e296849f8f93d2b1
Parents: cd48ca5
Author: lewuathe <lewuathe@me.com>
Authored: Tue Mar 31 11:25:21 2015 -0700
Committer: Xiangrui Meng <meng@databricks.com>
Committed: Tue Mar 31 11:25:21 2015 -0700

----------------------------------------------------------------------
 python/pyspark/mllib/feature.py |  6 ++++++
 python/pyspark/mllib/tests.py   | 14 ++++++++++++++
 2 files changed, 20 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/46de6c05/python/pyspark/mllib/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 0ffe092..4bfe301 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -244,6 +244,12 @@ class IDFModel(JavaVectorTransformer):
         x = _convert_to_vector(x)
         return JavaVectorTransformer.transform(self, x)
 
+    def idf(self):
+        """
+        Returns the current IDF vector.
+        """
+        return self.call('idf')
+
 
 class IDF(object):
     """

http://git-wip-us.apache.org/repos/asf/spark/blob/46de6c05/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 1550196..3bb0f0c 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -41,6 +41,7 @@ from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.stat import Statistics
+from pyspark.mllib.feature import IDF
 from pyspark.serializers import PickleSerializer
 from pyspark.sql import SQLContext
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
@@ -620,6 +621,19 @@ class ChiSqTestTests(PySparkTestCase):
         self.assertEqual(len(chi), num_cols)
         self.assertIsNotNone(chi[1000])
 
+
+class FeatureTest(PySparkTestCase):
+    def test_idf_model(self):
+        data = [
+            Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
+            Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
+            Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
+            Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
+        ]
+        model = IDF().fit(self.sc.parallelize(data, 2))
+        idf = model.idf()
+        self.assertEqual(len(idf), 11)
+
 if __name__ == "__main__":
     if not _have_scipy:
         print "NOTE: Skipping SciPy tests as it does not seem to be installed"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message