Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 5D540200B21 for ; Fri, 10 Jun 2016 13:27:00 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 5BE8D160A38; Fri, 10 Jun 2016 11:27:00 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id A5B3E160A04 for ; Fri, 10 Jun 2016 13:26:59 +0200 (CEST) Received: (qmail 22149 invoked by uid 500); 10 Jun 2016 11:26:58 -0000 Mailing-List: contact commits-help@spark.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list commits@spark.apache.org Received: (qmail 22140 invoked by uid 99); 10 Jun 2016 11:26:58 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 10 Jun 2016 11:26:58 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 8F4F1DFC74; Fri, 10 Jun 2016 11:26:58 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: srowen@apache.org To: commits@spark.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: spark git commit: [SPARK-15837][ML][PYSPARK] Word2vec python add maxsentence parameter Date: Fri, 10 Jun 2016 11:26:58 +0000 (UTC) archived-at: Fri, 10 Jun 2016 11:27:00 -0000 Repository: spark Updated Branches: refs/heads/master 16ca32eac -> cdd7f5a57 [SPARK-15837][ML][PYSPARK] Word2vec python add maxsentence parameter ## What changes were proposed in this pull request? Word2vec python add maxsentence parameter. ## How was this patch tested? Existing test. Author: WeichenXu Closes #13578 from WeichenXu123/word2vec_python_add_maxsentence. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cdd7f5a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cdd7f5a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cdd7f5a5 Branch: refs/heads/master Commit: cdd7f5a57a21d4a8f93456d149f65859c96190cf Parents: 16ca32e Author: WeichenXu Authored: Fri Jun 10 12:26:53 2016 +0100 Committer: Sean Owen Committed: Fri Jun 10 12:26:53 2016 +0100 ---------------------------------------------------------------------- python/pyspark/ml/feature.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/cdd7f5a5/python/pyspark/ml/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ebe1300..bfb2fb7 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2244,28 +2244,33 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has windowSize = Param(Params._dummy(), "windowSize", "the window size (context words from [-window, window]). Default value is 5", typeConverter=TypeConverters.toInt) + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", + "Maximum length (in words) of each sentence in the input data. " + + "Any sentence longer than this threshold will " + + "be divided into chunks up to the size.", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None, windowSize=5): + seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000): """ __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \ - seed=None, inputCol=None, outputCol=None, windowSize=5) + seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000) """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, windowSize=5) + seed=None, windowSize=5, maxSentenceLength=1000) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None, windowSize=5): + seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000): """ setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \ - inputCol=None, outputCol=None, windowSize=5) + inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000) Sets params for this Word2Vec. """ kwargs = self.setParams._input_kwargs @@ -2327,6 +2332,20 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ return self.getOrDefault(self.windowSize) + @since("2.0.0") + def setMaxSentenceLength(self, value): + """ + Sets the value of :py:attr:`maxSentenceLength`. + """ + return self._set(maxSentenceLength=value) + + @since("2.0.0") + def getMaxSentenceLength(self): + """ + Gets the value of maxSentenceLength or its default value. + """ + return self.getOrDefault(self.maxSentenceLength) + def _create_model(self, java_model): return Word2VecModel(java_model) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org