Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 9BE7D200C18 for ; Sat, 28 Jan 2017 01:04:06 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 9A69D160B5C; Sat, 28 Jan 2017 00:04:06 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id BD870160B5B for ; Sat, 28 Jan 2017 01:04:05 +0100 (CET) Received: (qmail 16791 invoked by uid 500); 28 Jan 2017 00:04:05 -0000 Mailing-List: contact commits-help@spark.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list commits@spark.apache.org Received: (qmail 16782 invoked by uid 99); 28 Jan 2017 00:04:04 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 28 Jan 2017 00:04:04 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id D8BC1DFC47; Sat, 28 Jan 2017 00:04:04 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jkbradley@apache.org To: commits@spark.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: spark git commit: [SPARK-19336][ML][PYSPARK] LinearSVC Python API Date: Sat, 28 Jan 2017 00:04:04 +0000 (UTC) archived-at: Sat, 28 Jan 2017 00:04:06 -0000 Repository: spark Updated Branches: refs/heads/master 21aa8c32b -> bb1a1fe05 [SPARK-19336][ML][PYSPARK] LinearSVC Python API ## What changes were proposed in this pull request? Add Python API for the newly added LinearSVC algorithm. ## How was this patch tested? Add new doc string test. Author: wm624@hotmail.com Closes #16694 from wangmiao1981/ser. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bb1a1fe0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bb1a1fe0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bb1a1fe0 Branch: refs/heads/master Commit: bb1a1fe05e293c480c88123d4c83a6b8c25f6e2e Parents: 21aa8c3 Author: wm624@hotmail.com Authored: Fri Jan 27 16:03:53 2017 -0800 Committer: Joseph K. Bradley Committed: Fri Jan 27 16:03:53 2017 -0800 ---------------------------------------------------------------------- .../spark/ml/classification/LinearSVC.scala | 4 +- python/pyspark/ml/classification.py | 131 ++++++++++++++++++- .../pyspark/ml/param/_shared_params_code_gen.py | 2 + python/pyspark/ml/param/shared.py | 24 ++++ 4 files changed, 158 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/bb1a1fe0/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index c4e93bf..3b14c4b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -114,7 +114,7 @@ class LinearSVC @Since("2.2.0") ( setDefault(standardization -> true) /** - * Sets the value of param [[weightCol]]. + * Set the value of param [[weightCol]]. * If this is not set or empty, we treat all instance weights as 1.0. * Default is not set, so all instances have weight one. * @@ -421,7 +421,7 @@ private class LinearSVCCostFun( /** * LinearSVCAggregator computes the gradient and loss for hinge loss function, as used - * in binary classification for instances in sparse or dense vector in a online fashion. 
+ * in binary classification for instances in sparse or dense vector in an online fashion. * * Two LinearSVCAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. http://git-wip-us.apache.org/repos/asf/spark/blob/bb1a1fe0/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5fe4bab..f10556c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -31,7 +31,8 @@ from pyspark.sql.functions import udf, when from pyspark.sql.types import ArrayType, DoubleType from pyspark.storagelevel import StorageLevel -__all__ = ['LogisticRegression', 'LogisticRegressionModel', +__all__ = ['LinearSVC', 'LinearSVCModel', + 'LogisticRegression', 'LogisticRegressionModel', 'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary', 'BinaryLogisticRegressionSummary', 'BinaryLogisticRegressionTrainingSummary', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', @@ -60,6 +61,134 @@ class JavaClassificationModel(JavaPredictionModel): @inherit_doc +class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, + HasRegParam, HasTol, HasRawPredictionCol, HasFitIntercept, HasStandardization, + HasThreshold, HasWeightCol, HasAggregationDepth, JavaMLWritable, JavaMLReadable): + """ + `Linear SVM Classifier <https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM>`_ + This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. + + >>> from pyspark.sql import Row + >>> from pyspark.ml.linalg import Vectors + >>> df = sc.parallelize([ + ... 
Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() + >>> svm = LinearSVC(maxIter=5, regParam=0.01) + >>> model = svm.fit(df) + >>> model.coefficients + DenseVector([0.0, -0.2792, -0.1833]) + >>> model.intercept + 1.0206118982229047 + >>> model.numClasses + 2 + >>> model.numFeatures + 3 + >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, -1.0, -1.0))]).toDF() + >>> result = model.transform(test0).head() + >>> result.prediction + 1.0 + >>> result.rawPrediction + DenseVector([-1.4831, 1.4831]) + >>> svm.setParams("vector") + Traceback (most recent call last): + ... + TypeError: Method setParams forces keyword arguments. + >>> svm_path = temp_path + "/svm" + >>> svm.save(svm_path) + >>> svm2 = LinearSVC.load(svm_path) + >>> svm2.getMaxIter() + 5 + >>> model_path = temp_path + "/svm_model" + >>> model.save(model_path) + >>> model2 = LinearSVCModel.load(model_path) + >>> model.coefficients[0] == model2.coefficients[0] + True + >>> model.intercept == model2.intercept + True + + .. 
versionadded:: 2.2.0 + """ + + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", + fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, + aggregationDepth=2): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \ + fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \ + aggregationDepth=2): + """ + super(LinearSVC, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.classification.LinearSVC", self.uid) + self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True, + standardization=True, threshold=0.0, aggregationDepth=2) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + @since("2.2.0") + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", + fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, + aggregationDepth=2): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \ + fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \ + aggregationDepth=2): + Sets params for Linear SVM Classifier. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def _create_model(self, java_model): + return LinearSVCModel(java_model) + + +class LinearSVCModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): + """ + Model fitted by LinearSVC. + + .. versionadded:: 2.2.0 + """ + + @property + @since("2.2.0") + def coefficients(self): + """ + Model coefficients of Linear SVM Classifier. 
+ """ + return self._call_java("coefficients") + + @property + @since("2.2.0") + def intercept(self): + """ + Model intercept of Linear SVM Classifier. + """ + return self._call_java("intercept") + + @property + @since("2.2.0") + def numClasses(self): + """ + Number of classes. + """ + return self._call_java("numClasses") + + @property + @since("2.2.0") + def numFeatures(self): + """ + Number of features. + """ + return self._call_java("numFeatures") + + +@inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, http://git-wip-us.apache.org/repos/asf/spark/blob/bb1a1fe0/python/pyspark/ml/param/_shared_params_code_gen.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 9295912..51d49b5 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -143,6 +143,8 @@ if __name__ == "__main__": "The class with largest value p/t is predicted, where p is the original " + "probability of that class and t is the class's threshold.", None, "TypeConverters.toListFloat"), + ("threshold", "threshold in binary classification prediction, in range [0, 1]", + "0.5", "TypeConverters.toFloat"), ("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0.", None, "TypeConverters.toString"), ("solver", "the solver algorithm for optimization. 
If this is not set or empty, " + http://git-wip-us.apache.org/repos/asf/spark/blob/bb1a1fe0/python/pyspark/ml/param/shared.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index cc59693..163a0e2 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -490,6 +490,30 @@ class HasThresholds(Params): return self.getOrDefault(self.thresholds) +class HasThreshold(Params): + """ + Mixin for param threshold: threshold in binary classification prediction, in range [0, 1] + """ + + threshold = Param(Params._dummy(), "threshold", "threshold in binary classification prediction, in range [0, 1]", typeConverter=TypeConverters.toFloat) + + def __init__(self): + super(HasThreshold, self).__init__() + self._setDefault(threshold=0.5) + + def setThreshold(self, value): + """ + Sets the value of :py:attr:`threshold`. + """ + return self._set(threshold=value) + + def getThreshold(self): + """ + Gets the value of threshold or its default value. + """ + return self.getOrDefault(self.threshold) + + class HasWeightCol(Params): """ Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org