spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jkbrad...@apache.org
Subject spark git commit: [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct
Date Tue, 18 Aug 2015 00:25:45 GMT
Repository: spark
Updated Branches:
  refs/heads/master 52ae95257 -> 0076e8212


[SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct

Add Python API, user guide and example for ml.feature.ElementwiseProduct.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8061 from yanboliang/SPARK-9768.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0076e821
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0076e821
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0076e821

Branch: refs/heads/master
Commit: 0076e8212334c613599dcbc2ac23f49e9e50cc44
Parents: 52ae952
Author: Yanbo Liang <ybliang8@gmail.com>
Authored: Mon Aug 17 17:25:41 2015 -0700
Committer: Joseph K. Bradley <joseph@databricks.com>
Committed: Mon Aug 17 17:25:41 2015 -0700

----------------------------------------------------------------------
 docs/ml-features.md          | 23 +++++++++++---
 python/pyspark/ml/feature.py | 67 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 81 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/docs/ml-features.md
----------------------------------------------------------------------
diff --git a/docs/ml-features.md b/docs/ml-features.md
index cec2cbe..6b2e36b 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1212,7 +1212,7 @@ v_N
 This example below demonstrates how to transform vectors using a transforming vector value.
 
 <div class="codetabs">
-<div data-lang="scala">
+<div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.ml.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
@@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct()
   .setOutputCol("transformedVector")
 
 // Batch transform the vectors to create new column:
-val transformedData = transformer.transform(dataFrame)
+transformer.transform(dataFrame).show()
 
 {% endhighlight %}
 </div>
 
-<div data-lang="java">
+<div data-lang="java" markdown="1">
 {% highlight java %}
 import com.google.common.collect.Lists;
 
@@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new ElementwiseProduct()
   .setInputCol("vector")
   .setOutputCol("transformedVector");
 // Batch transform the vectors to create new column:
-DataFrame transformedData = transformer.transform(dataFrame);
+transformer.transform(dataFrame).show();
 
 {% endhighlight %}
 </div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+df = sqlContext.createDataFrame(data, ["vector"])
+transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), 
+                                 inputCol="vector", outputCol="transformedVector")
+transformer.transform(df).show()
+
+{% endhighlight %}
+</div>
+
 </div>
 
 ## VectorAssembler

http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 535d553..04b2b2c 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,11 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
-           'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
-           'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
-           'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA',
-           'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
+           'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
+           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
+           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
+           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
 
 
 @inherit_doc
@@ -167,6 +167,63 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
 
 
 @inherit_doc
+class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    Outputs the Hadamard product (i.e., the element-wise product) of each input vector
+    with a provided "weight" vector. In other words, it scales each column of the dataset
+    by a scalar multiplier.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
+    >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
+    ...     inputCol="values", outputCol="eprod")
+    >>> ep.transform(df).head().eprod
+    DenseVector([2.0, 2.0, 9.0])
+    >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod
+    DenseVector([4.0, 3.0, 15.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
+                       "it must be MLlib Vector type.")
+
+    @keyword_only
+    def __init__(self, scalingVec=None, inputCol=None, outputCol=None):
+        """
+        __init__(self, scalingVec=None, inputCol=None, outputCol=None)
+        """
+        super(ElementwiseProduct, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
+                                            self.uid)
+        self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " +
+                                "it must be MLlib Vector type.")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, scalingVec=None, inputCol=None, outputCol=None):
+        """
+        setParams(self, scalingVec=None, inputCol=None, outputCol=None)
+        Sets params for this ElementwiseProduct.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setScalingVec(self, value):
+        """
+        Sets the value of :py:attr:`scalingVec`.
+        """
+        self._paramMap[self.scalingVec] = value
+        return self
+
+    def getScalingVec(self):
+        """
+        Gets the value of scalingVec or its default value.
+        """
+        return self.getOrDefault(self.scalingVec)
+
+
+@inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
     """
     Maps a sequence of terms to their term frequencies using the


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message