spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From 3ourroom <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-11940][PYSPARK] Python API for ml.clust...
Date Tue, 15 Dec 2015 10:30:12 GMT
Github user 3ourroom commented on a diff in the pull request:

    https://github.com/apache/spark/pull/10242#discussion_r47619640
  
    --- Diff: python/pyspark/ml/clustering.py ---
    @@ -167,6 +167,200 @@ def getInitSteps(self):
             return self.getOrDefault(self.initSteps)
     
     
    +class LDAModel(JavaModel):
    +    """ A clustering model derived from the LDA method.
    +
    +    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
    +    Terminology
    +    - "word" = "term": an element of the vocabulary
    +    - "token": instance of a term appearing in a document
    +    - "topic": multinomial distribution over words representing some concept
    +    References:
    +    - Original LDA paper (journal version):
    +    Blei, Ng, and Jordan.  "Latent Dirichlet Allocation."  JMLR, 2003.
    +    """
    +
    +    @since("1.7.0")
    +    def isDistributed(self):
    +        """Indicates whether this instance is of type DistributedLDAModel"""
    +        return self._call_java("isDistributed")
    +
    +    @since("1.7.0")
    +    def vocabSize(self):
    +        """Vocabulary size (number of terms or terms in the vocabulary)"""
    +        return self._call_java("vocabSize")
    +
    +    @since("1.7.0")
    +    def topicsMatrix(self):
    +        """Inferred topics, where each topic is represented by a distribution over terms."""
    +        return self._call_java("topicsMatrix")
    +
    +    @since("1.7.0")
    +    def logLikelihood(self, dataset):
    +        """Calculates a lower bound on the log likelihood of the entire corpus."""
    +        return self._call_java("logLikelihood", dataset)
    +
    +    @since("1.7.0")
    +    def logPerplexity(self, dataset):
    +        """Calculate an upper bound bound on perplexity.  (Lower is better.)"""
    +        return self._call_java("logPerplexity", dataset)
    +
    +    @since("1.7.0")
    +    def describeTopics(self, maxTermsPerTopic=10):
    +        """Return the topics described by weighted terms.
    +
    +        WARNING: If vocabSize and k are large, this can return a large object!
    +
    +        :param maxTermsPerTopic: Maximum number of terms to collect for each topic.
    +            (default: 10)
    +        :return: Array over topics. Each topic is represented as a pair of matching arrays:
    +            (term indices, term weights in topic).
    +            Each topic's terms are sorted in order of decreasing weight.
    +        """
    +        return self._call_java("describeTopics", maxTermsPerTopic)
    +
    +
    +class DistributedLDAModel(LDAModel):
    +    """
    +    Model fitted by LDA.
    +
    +    .. versionadded:: 1.7.0
    +    """
    +    def toLocal(self):
    +        return self._call_java("toLocal")
    +
    +
    +class LocalLDAModel(LDAModel):
    +    """
    +    Model fitted by LDA.
    +
    +    .. versionadded:: 1.7.0
    +    """
    +    pass
    +
    +
    +class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
    +    """ A clustering model derived from the LDA method.
    +
    +    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
    +    Terminology
    +    - "word" = "term": an element of the vocabulary
    +    - "token": instance of a term appearing in a document
    +    - "topic": multinomial distribution over words representing some concept
    +    References:
    +    - Original LDA paper (journal version):
    +    Blei, Ng, and Jordan.  "Latent Dirichlet Allocation."  JMLR, 2003.
    +
    +    >>> from pyspark.mllib.linalg import Vectors, SparseVector
    +    >>> from pyspark.ml.clustering import LDA
    +    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
    +        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
    +    >>> lda = LDA(k=2, seed=1, optimizer="em")
    +    >>> model = lda.fit(df)
    +    >>> model.isDistributed()
    +    True
    +    >>> localModel = model.toLocal()
    +    >>> localModel.isDistributed()
    +    False
    +    >>> model.vocabSize()
    +    2
    +    >>> model.describeTopics().show()
    +    +-----+-----------+--------------------+
    +    |topic|termIndices|         termWeights|
    +    +-----+-----------+--------------------+
    +    |    0|     [1, 0]|[0.50401530077160...|
    +    |    1|     [0, 1]|[0.50401530077160...|
    +    +-----+-----------+--------------------+
    +    ...
    +    >>> model.topicsMatrix()
    +    DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)
    +
    +    .. versionadded:: 1.7.0
    +    """
    +
    +    # a placeholder to make it appear in the generated doc
    +    k = Param(Params._dummy(), "k", "number of clusters to create")
    +    optimizer = Param(Params._dummy(), "optimizer", "LDA optimizer")
    +
    +    @keyword_only
    +    def __init__(self, featuresCol="features", k=2,
    +                 optimizer="online", learningOffset=5, learningDecay=0.51,
    +                 subsamplingRate=0.05, optimizeDocConcentration=True,
    +                 checkpointInterval=10, maxIter=20, seed=None):
    +        """
    +        __init__(self, featuresCol="features", predictionCol="prediction", k=2, \
    +                 initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None)
    +        """
    +        super(LDA, self).__init__()
    +        self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid)
    +        self.k = Param(self, "k", "number of clusters to create")
    +        self.optimizer = Param(self, "optimizer", "LDA optimizer")
    +        self._setDefault(k=2, optimizer="online", maxIter=20)
    +        kwargs = self.__init__._input_kwargs
    +        self.setParams(**kwargs)
    +
    +    def _create_model(self, java_model):
    +        if self.getOptimizer() == "em":
    +            return DistributedLDAModel(java_model)
    +        else:
    +            return LocalLDAModel(java_model)
    +
    +    @keyword_only
    +    @since("1.7.0")
    +    def setParams(self, featuresCol="features", k=2,
    +                  optimizer="online", learningOffset=5, learningDecay=0.51,
    +                  subsamplingRate=0.05, optimizeDocConcentration=True,
    +                  checkpointInterval=10, maxIter=20, seed=None):
    +        """
    +        ssetParams(self, featuresCol="features", k=2,
    --- End diff --
    
    
    NAVER - http://www.naver.com/
    --------------------------------------------
    
    3ourroom@naver.com 님께 보내신 메일 <Re: [spark] [SPARK-11940][PYSPARK] Python
API for ml.clustering.LDA (#10242)> 이 다음과 같은 이유로 전송 실패했습니다.
    
    --------------------------------------------
    
    받는 사람이 회원님의 메일을 수신차단 하였습니다. 
    
    
    --------------------------------------------



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message