Return-Path: 
X-Original-To: apmail-spark-commits-archive@minotaur.apache.org
Delivered-To: apmail-spark-commits-archive@minotaur.apache.org
Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 79FA6184F7 for ; Fri, 21 Aug 2015 21:19:30 +0000 (UTC)
Received: (qmail 96788 invoked by uid 500); 21 Aug 2015 21:19:30 -0000
Delivered-To: apmail-spark-commits-archive@spark.apache.org
Received: (qmail 96706 invoked by uid 500); 21 Aug 2015 21:19:30 -0000
Mailing-List: contact commits-help@spark.apache.org; run by ezmlm
Precedence: bulk
List-Help: 
List-Unsubscribe: 
List-Post: 
List-Id: 
Delivered-To: mailing list commits@spark.apache.org
Received: (qmail 96626 invoked by uid 99); 21 Aug 2015 21:19:30 -0000
Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 21 Aug 2015 21:19:30 +0000
Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 0DB74E1144; Fri, 21 Aug 2015 21:19:30 +0000 (UTC)
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: meng@apache.org
To: commits@spark.apache.org
Date: Fri, 21 Aug 2015 21:19:31 -0000
Message-Id: 
In-Reply-To: 
References: 
X-Mailer: ASF-Git Admin Mailer
Subject: [3/3] spark git commit: [SPARK-9864] [DOC] [MLlib] [SQL] Replace since in scaladoc to Since annotation

[SPARK-9864] [DOC] [MLlib] [SQL] Replace since in scaladoc to Since annotation

Author: MechCoder

Closes #8352 from MechCoder/since.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5b028ed
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5b028ed
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5b028ed

Branch: refs/heads/master
Commit: f5b028ed2f1ad6de43c8b50ebf480e1b6c047035
Parents: d89cc38
Author: MechCoder
Authored: Fri Aug 21 14:19:24 2015 -0700
Committer: Xiangrui Meng
Committed: Fri Aug 21 14:19:24 2015 -0700

----------------------------------------------------------------------
 .../classification/ClassificationModel.scala | 8 +-
 .../classification/LogisticRegression.scala | 30 +++---
 .../spark/mllib/classification/NaiveBayes.scala | 7 +-
 .../apache/spark/mllib/classification/SVM.scala | 28 ++---
 .../mllib/clustering/GaussianMixture.scala | 28 ++---
 .../mllib/clustering/GaussianMixtureModel.scala | 28 ++---
 .../apache/spark/mllib/clustering/KMeans.scala | 50 ++++-----
 .../spark/mllib/clustering/KMeansModel.scala | 27 ++---
 .../org/apache/spark/mllib/clustering/LDA.scala | 56 +++++-----
 .../spark/mllib/clustering/LDAModel.scala | 69 +++++-------
 .../spark/mllib/clustering/LDAOptimizer.scala | 24 ++---
 .../clustering/PowerIterationClustering.scala | 38 +++----
 .../mllib/clustering/StreamingKMeans.scala | 35 +++---
 .../BinaryClassificationMetrics.scala | 26 ++---
 .../mllib/evaluation/MulticlassMetrics.scala | 20 ++--
 .../mllib/evaluation/MultilabelMetrics.scala | 9 +-
 .../spark/mllib/evaluation/RankingMetrics.scala | 10 +-
 .../mllib/evaluation/RegressionMetrics.scala | 14 +--
 .../spark/mllib/fpm/AssociationRules.scala | 20 ++--
 .../org/apache/spark/mllib/fpm/FPGrowth.scala | 22 ++--
 .../apache/spark/mllib/linalg/Matrices.scala | 106 ++++++++-----------
 .../linalg/SingularValueDecomposition.scala | 4 +-
 .../org/apache/spark/mllib/linalg/Vectors.scala | 90 +++++-----------
 .../mllib/linalg/distributed/BlockMatrix.scala | 88 +++++++--------
 .../linalg/distributed/CoordinateMatrix.scala | 40 +++----
 .../linalg/distributed/DistributedMatrix.scala | 4 +-
 .../linalg/distributed/IndexedRowMatrix.scala | 38 +++----
 .../mllib/linalg/distributed/RowMatrix.scala | 39 +++----
 .../apache/spark/mllib/recommendation/ALS.scala | 22 ++--
 .../MatrixFactorizationModel.scala | 28 +++--
 .../regression/GeneralizedLinearAlgorithm.scala | 24 ++---
 .../mllib/regression/IsotonicRegression.scala | 22 ++--
 .../spark/mllib/regression/LabeledPoint.scala | 7 +-
 .../apache/spark/mllib/regression/Lasso.scala | 25 ++---
 .../mllib/regression/LinearRegression.scala | 25 ++---
 .../mllib/regression/RegressionModel.scala | 12 +--
 .../mllib/regression/RidgeRegression.scala | 25 ++---
 .../regression/StreamingLinearAlgorithm.scala | 18 ++--
 .../apache/spark/mllib/stat/KernelDensity.scala | 12 +--
 .../stat/MultivariateOnlineSummarizer.scala | 24 ++---
 .../stat/MultivariateStatisticalSummary.scala | 19 ++--
 .../apache/spark/mllib/stat/Statistics.scala | 30 +++---
 .../distribution/MultivariateGaussian.scala | 8 +-
 .../apache/spark/mllib/tree/DecisionTree.scala | 28 +++--
 .../spark/mllib/tree/GradientBoostedTrees.scala | 20 ++--
 .../apache/spark/mllib/tree/RandomForest.scala | 20 ++--
 .../spark/mllib/tree/configuration/Algo.scala | 4 +-
 .../tree/configuration/BoostingStrategy.scala | 12 +--
 .../mllib/tree/configuration/FeatureType.scala | 4 +-
 .../tree/configuration/QuantileStrategy.scala | 4 +-
 .../mllib/tree/configuration/Strategy.scala | 24 ++---
 .../spark/mllib/tree/impurity/Entropy.scala | 10 +-
 .../apache/spark/mllib/tree/impurity/Gini.scala | 10 +-
 .../spark/mllib/tree/impurity/Impurity.scala | 8 +-
 .../spark/mllib/tree/impurity/Variance.scala | 10 +-
 .../spark/mllib/tree/loss/AbsoluteError.scala | 6 +-
 .../apache/spark/mllib/tree/loss/LogLoss.scala | 6 +-
 .../org/apache/spark/mllib/tree/loss/Loss.scala | 8 +-
 .../apache/spark/mllib/tree/loss/Losses.scala | 10 +-
 .../spark/mllib/tree/loss/SquaredError.scala | 6 +-
 .../mllib/tree/model/DecisionTreeModel.scala | 22 ++--
 .../mllib/tree/model/InformationGainStats.scala | 4 +-
 .../apache/spark/mllib/tree/model/Node.scala | 8 +-
 .../apache/spark/mllib/tree/model/Predict.scala | 4 +-
 .../apache/spark/mllib/tree/model/Split.scala | 4 +-
 .../mllib/tree/model/treeEnsembleModels.scala | 26 +++--
 .../org/apache/spark/mllib/tree/package.scala | 1 -
 .../org/apache/spark/mllib/util/MLUtils.scala | 36 +++----
 68 files changed, 692 insertions(+), 862 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
index ba73024..a29b425 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.classification

 import org.json4s.{DefaultFormats, JValue}

-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.rdd.RDD
@@ -36,8 +36,8 @@ trait ClassificationModel extends Serializable {
    *
    * @param testData RDD representing data points to be
predicted * @return an RDD[Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -45,16 +45,16 @@ trait ClassificationModel extends Serializable { * * @param testData array representing a single data point * @return predicted category from the trained model - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: Vector): Double /** * Predict values for examples stored in a JavaRDD. * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 268642a..e03e662 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.linalg.{DenseVector, Vector} @@ -85,8 +85,8 @@ class LogisticRegressionModel ( * in Binary Logistic Regression. An example with prediction score greater than or equal to * this threshold is identified as an positive, and negative otherwise. The default value is 0.5. * It is only used for binary classification. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -97,8 +97,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. * It is only used for binary classification. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold @@ -106,8 +106,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. * It is only used for binary classification. 
- * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -158,9 +158,7 @@ class LogisticRegressionModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures, numClasses, weights, intercept, threshold) @@ -168,9 +166,7 @@ class LogisticRegressionModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}" } @@ -178,9 +174,7 @@ class LogisticRegressionModel ( object LogisticRegressionModel extends Loader[LogisticRegressionModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): LogisticRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -261,8 +255,8 @@ object LogisticRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -284,8 +278,8 @@ object LogisticRegressionWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -306,8 +300,8 @@ object LogisticRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -324,8 +318,8 @@ object LogisticRegressionWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int): LogisticRegressionModel = { @@ -361,8 +355,8 @@ class LogisticRegressionWithLBFGS * Set the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * By default, it is binary logistic regression so k will be set to 2. 
- * @since 1.3.0 */ + @Since("1.3.0") @Experimental def setNumClasses(numClasses: Int): this.type = { require(numClasses > 1) http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 2df91c0..dab3692 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -25,6 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext, SparkException} +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{Loader, Saveable} @@ -444,8 +445,8 @@ object NaiveBayes { * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint]): NaiveBayesModel = { new NaiveBayes().run(input) } @@ -460,8 +461,8 @@ object NaiveBayes { * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. * @param lambda The smoothing parameter - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { new NaiveBayes(lambda, Multinomial).run(input) } @@ -483,8 +484,8 @@ object NaiveBayes { * * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be * multinomial or bernoulli - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { require(supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 5b54fee..5f87269 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ @@ -46,8 +46,8 @@ class SVMModel ( * Sets the threshold that separates positive predictions from negative predictions. An example * with prediction score greater than or equal to this threshold is identified as an positive, * and negative otherwise. The default value is 0.0. 
- * @since 1.3.0 */ + @Since("1.3.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -57,16 +57,16 @@ class SVMModel ( /** * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold /** * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -84,9 +84,7 @@ class SVMModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures = weights.size, numClasses = 2, weights, intercept, threshold) @@ -94,9 +92,7 @@ class SVMModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}" } @@ -104,9 +100,7 @@ class SVMModel ( object SVMModel extends Loader[SVMModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): SVMModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -185,8 +179,8 @@ object SVMWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -209,8 +203,8 @@ object SVMWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -231,8 +225,8 @@ object SVMWithSGD { * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -250,8 +244,8 @@ object SVMWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. 
- * @since 0.8.0 */ + @Since("0.8.0") def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = { train(input, numIterations, 1.0, 0.01, 1.0) } http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index bc27b1f..fcc9dfe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.IndexedSeq import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -62,8 +62,8 @@ class GaussianMixture private ( /** * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01, * maxIterations: 100, seed: random}. - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(2, 0.01, 100, Utils.random.nextLong()) // number of samples per cluster to use when initializing Gaussians @@ -77,8 +77,8 @@ class GaussianMixture private ( * Set the initial GMM starting point, bypassing the random initialization. * You must call setK() prior to calling this method, and the condition * (model.k == this.k) must be met; failure will result in an IllegalArgumentException - * @since 1.3.0 */ + @Since("1.3.0") def setInitialModel(model: GaussianMixtureModel): this.type = { if (model.k == k) { initialModel = Some(model) @@ -90,14 +90,14 @@ class GaussianMixture private ( /** * Return the user supplied initial GMM, if supplied - * @since 1.3.0 */ + @Since("1.3.0") def getInitialModel: Option[GaussianMixtureModel] = initialModel /** * Set the number of Gaussians in the mixture model. Default: 2 - * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this @@ -105,14 +105,14 @@ class GaussianMixture private ( /** * Return the number of Gaussians in the mixture model - * @since 1.3.0 */ + @Since("1.3.0") def getK: Int = k /** * Set the maximum number of iterations to run. Default: 100 - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -120,15 +120,15 @@ class GaussianMixture private ( /** * Return the maximum number of iterations to run - * @since 1.3.0 */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Set the largest change in log-likelihood at which convergence is * considered to have occurred. - * @since 1.3.0 */ + @Since("1.3.0") def setConvergenceTol(convergenceTol: Double): this.type = { this.convergenceTol = convergenceTol this @@ -137,14 +137,14 @@ class GaussianMixture private ( /** * Return the largest change in log-likelihood at which convergence is * considered to have occurred. 
- * @since 1.3.0 */ + @Since("1.3.0") def getConvergenceTol: Double = convergenceTol /** * Set the random seed - * @since 1.3.0 */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -152,14 +152,14 @@ class GaussianMixture private ( /** * Return the random seed - * @since 1.3.0 */ + @Since("1.3.0") def getSeed: Long = seed /** * Perform expectation maximization - * @since 1.3.0 */ + @Since("1.3.0") def run(data: RDD[Vector]): GaussianMixtureModel = { val sc = data.sparkContext @@ -235,8 +235,8 @@ class GaussianMixture private ( /** * Java-friendly version of [[run()]] - * @since 1.3.0 */ + @Since("1.3.0") def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd) private def updateWeightsAndGaussians( http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 2fa0473..1a10a8b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Matrices, Matrix} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -43,8 +43,8 @@ import org.apache.spark.sql.{SQLContext, Row} * the weight for Gaussian i, and weights.sum == 1 * @param gaussians Array of MultivariateGaussian where gaussians(i) represents * the Multivariate Gaussian (Normal) Distribution for Gaussian i - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class GaussianMixtureModel( val weights: Array[Double], @@ -54,23 +54,21 @@ class GaussianMixtureModel( override protected def formatVersion = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { GaussianMixtureModel.SaveLoadV1_0.save(sc, path, weights, gaussians) } /** * Number of gaussians in mixture - * @since 1.3.0 */ + @Since("1.3.0") def k: Int = weights.length /** * Maps given points to their cluster indices. - * @since 1.3.0 */ + @Since("1.3.0") def predict(points: RDD[Vector]): RDD[Int] = { val responsibilityMatrix = predictSoft(points) responsibilityMatrix.map(r => r.indexOf(r.max)) @@ -78,8 +76,8 @@ class GaussianMixtureModel( /** * Maps given point to its cluster index. - * @since 1.5.0 */ + @Since("1.5.0") def predict(point: Vector): Int = { val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) r.indexOf(r.max) @@ -87,16 +85,16 @@ class GaussianMixtureModel( /** * Java-friendly version of [[predict()]] - * @since 1.4.0 */ + @Since("1.4.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Given the input vectors, return the membership value of each vector * to all mixture components. 
- * @since 1.3.0 */ + @Since("1.3.0") def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = { val sc = points.sparkContext val bcDists = sc.broadcast(gaussians) @@ -108,8 +106,8 @@ class GaussianMixtureModel( /** * Given the input vector, return the membership values to all mixture components. - * @since 1.4.0 */ + @Since("1.4.0") def predictSoft(point: Vector): Array[Double] = { computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) } @@ -133,9 +131,7 @@ class GaussianMixtureModel( } } -/** - * @since 1.4.0 - */ +@Since("1.4.0") @Experimental object GaussianMixtureModel extends Loader[GaussianMixtureModel] { @@ -186,9 +182,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { } } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def load(sc: SparkContext, path: String) : GaussianMixtureModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 9ef6834..3e9545a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} import org.apache.spark.mllib.util.MLUtils @@ -49,20 +49,20 @@ class KMeans private ( /** * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1, * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}. - * @since 0.8.0 */ + @Since("0.8.0") def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) /** * Number of clusters to create (k). - * @since 1.4.0 */ + @Since("1.4.0") def getK: Int = k /** * Set the number of clusters to create (k). Default: 2. - * @since 0.8.0 */ + @Since("0.8.0") def setK(k: Int): this.type = { this.k = k this @@ -70,14 +70,14 @@ class KMeans private ( /** * Maximum number of iterations to run. - * @since 1.4.0 */ + @Since("1.4.0") def getMaxIterations: Int = maxIterations /** * Set maximum number of iterations to run. Default: 20. - * @since 0.8.0 */ + @Since("0.8.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -85,16 +85,16 @@ class KMeans private ( /** * The initialization algorithm. This can be either "random" or "k-means||". - * @since 1.4.0 */ + @Since("1.4.0") def getInitializationMode: String = initializationMode /** * Set the initialization algorithm. This can be either "random" to choose random points as * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. 
- * @since 0.8.0 */ + @Since("0.8.0") def setInitializationMode(initializationMode: String): this.type = { KMeans.validateInitMode(initializationMode) this.initializationMode = initializationMode @@ -104,8 +104,8 @@ class KMeans private ( /** * :: Experimental :: * Number of runs of the algorithm to execute in parallel. - * @since 1.4.0 */ + @Since("1.4.0") @Experimental def getRuns: Int = runs @@ -114,8 +114,8 @@ class KMeans private ( * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm * this many times with random starting conditions (configured by the initialization mode), then * return the best clustering found over any run. Default: 1. - * @since 0.8.0 */ + @Since("0.8.0") @Experimental def setRuns(runs: Int): this.type = { if (runs <= 0) { @@ -127,15 +127,15 @@ class KMeans private ( /** * Number of steps for the k-means|| initialization mode - * @since 1.4.0 */ + @Since("1.4.0") def getInitializationSteps: Int = initializationSteps /** * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. - * @since 0.8.0 */ + @Since("0.8.0") def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { throw new IllegalArgumentException("Number of initialization steps must be positive") @@ -146,15 +146,15 @@ class KMeans private ( /** * The distance threshold within which we've consider centers to have converged. - * @since 1.4.0 */ + @Since("1.4.0") def getEpsilon: Double = epsilon /** * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. - * @since 0.8.0 */ + @Since("0.8.0") def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon this @@ -162,14 +162,14 @@ class KMeans private ( /** * The random seed for cluster initialization. - * @since 1.4.0 */ + @Since("1.4.0") def getSeed: Long = seed /** * Set the random seed for cluster initialization. - * @since 1.4.0 */ + @Since("1.4.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -183,8 +183,8 @@ class KMeans private ( * Set the initial starting point, bypassing the random initialization or k-means|| * The condition model.k == this.k must be met, failure results * in an IllegalArgumentException. - * @since 1.4.0 */ + @Since("1.4.0") def setInitialModel(model: KMeansModel): this.type = { require(model.k == k, "mismatched cluster count") initialModel = Some(model) @@ -194,8 +194,8 @@ class KMeans private ( /** * Train a K-means model on the given set of points; `data` should be cached for high * performance, because this is an iterative algorithm. - * @since 0.8.0 */ + @Since("0.8.0") def run(data: RDD[Vector]): KMeansModel = { if (data.getStorageLevel == StorageLevel.NONE) { @@ -453,14 +453,14 @@ class KMeans private ( /** * Top-level methods for calling K-means clustering. - * @since 0.8.0 */ +@Since("0.8.0") object KMeans { // Initialization mode names - /** @since 0.8.0 */ + @Since("0.8.0") val RANDOM = "random" - /** @since 0.8.0 */ + @Since("0.8.0") val K_MEANS_PARALLEL = "k-means||" /** @@ -472,8 +472,8 @@ object KMeans { * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). 
* @param seed random seed value for cluster initialization - * @since 1.3.0 */ + @Since("1.3.0") def train( data: RDD[Vector], k: Int, @@ -497,8 +497,8 @@ object KMeans { * @param maxIterations max number of iterations * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). - * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -514,8 +514,8 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. - * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -525,8 +525,8 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. - * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index 8de2087..e425ecd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -23,6 +23,7 @@ import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.pmml.PMMLExportable @@ -34,35 +35,35 @@ import org.apache.spark.sql.Row /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. - * @since 0.8.0 */ +@Since("0.8.0") class KMeansModel ( val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable { /** * A Java-friendly constructor that takes an Iterable of Vectors. - * @since 1.4.0 */ + @Since("1.4.0") def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray) /** * Total number of clusters. - * @since 0.8.0 */ + @Since("0.8.0") def k: Int = clusterCenters.length /** * Returns the cluster index that a given point belongs to. - * @since 0.8.0 */ + @Since("0.8.0") def predict(point: Vector): Int = { KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1 } /** * Maps given points to their cluster indices. - * @since 1.0.0 */ + @Since("1.0.0") def predict(points: RDD[Vector]): RDD[Int] = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = points.context.broadcast(centersWithNorm) @@ -71,16 +72,16 @@ class KMeansModel ( /** * Maps given points to their cluster indices. - * @since 1.0.0 */ + @Since("1.0.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Return the K-means cost (sum of squared distances of points to their nearest center) for this * model on the given data. 
- * @since 0.8.0 */ + @Since("0.8.0") def computeCost(data: RDD[Vector]): Double = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = data.context.broadcast(centersWithNorm) @@ -90,9 +91,7 @@ class KMeansModel ( private def clusterCentersWithNorm: Iterable[VectorWithNorm] = clusterCenters.map(new VectorWithNorm(_)) - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { KMeansModel.SaveLoadV1_0.save(sc, this, path) } @@ -100,14 +99,10 @@ class KMeansModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.4.0 - */ +@Since("1.4.0") object KMeansModel extends Loader[KMeansModel] { - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def load(sc: SparkContext, path: String): KMeansModel = { KMeansModel.SaveLoadV1_0.load(sc, path) } http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 2a8c6ac..92a321a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{DenseVector => BDV} import org.apache.spark.Logging -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.graphx._ import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -43,8 +43,8 @@ import org.apache.spark.util.Utils * * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation * (Wikipedia)]] - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class LDA private ( private var k: Int, @@ -57,8 +57,8 @@ class LDA private ( /** * Constructs a LDA instance with default parameters. - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1), topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer) @@ -66,15 +66,15 @@ class LDA private ( /** * Number of topics to infer. I.e., the number of soft cluster centers. * - * @since 1.3.0 */ + @Since("1.3.0") def getK: Int = k /** * Number of topics to infer. I.e., the number of soft cluster centers. * (default = 10) - * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { require(k > 0, s"LDA k (number of clusters) must be > 0, but was set to $k") this.k = k @@ -86,8 +86,8 @@ class LDA private ( * distributions over topics ("theta"). * * This is the parameter to a Dirichlet distribution. - * @since 1.5.0 */ + @Since("1.5.0") def getAsymmetricDocConcentration: Vector = this.docConcentration /** @@ -96,8 +96,8 @@ class LDA private ( * * This method assumes the Dirichlet distribution is symmetric and can be described by a single * [[Double]] parameter. It should fail if docConcentration is asymmetric. - * @since 1.3.0 */ + @Since("1.3.0") def getDocConcentration: Double = { val parameter = docConcentration(0) if (docConcentration.size == 1) { @@ -131,8 +131,8 @@ class LDA private ( * - Values should be >= 0 * - default = uniformly (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. 
- * @since 1.5.0 */ + @Since("1.5.0") def setDocConcentration(docConcentration: Vector): this.type = { require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration @@ -141,8 +141,8 @@ class LDA private ( /** * Replicates a [[Double]] docConcentration to create a symmetric prior. - * @since 1.3.0 */ + @Since("1.3.0") def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this @@ -150,26 +150,26 @@ class LDA private ( /** * Alias for [[getAsymmetricDocConcentration]] - * @since 1.5.0 */ + @Since("1.5.0") def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration /** * Alias for [[getDocConcentration]] - * @since 1.3.0 */ + @Since("1.3.0") def getAlpha: Double = getDocConcentration /** * Alias for [[setDocConcentration()]] - * @since 1.5.0 */ + @Since("1.5.0") def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) /** * Alias for [[setDocConcentration()]] - * @since 1.3.0 */ + @Since("1.3.0") def setAlpha(alpha: Double): this.type = setDocConcentration(alpha) /** @@ -180,8 +180,8 @@ class LDA private ( * * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. - * @since 1.3.0 */ + @Since("1.3.0") def getTopicConcentration: Double = this.topicConcentration /** @@ -205,8 +205,8 @@ class LDA private ( * - Value should be >= 0 * - default = (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. - * @since 1.3.0 */ + @Since("1.3.0") def setTopicConcentration(topicConcentration: Double): this.type = { this.topicConcentration = topicConcentration this @@ -214,27 +214,27 @@ class LDA private ( /** * Alias for [[getTopicConcentration]] - * @since 1.3.0 */ + @Since("1.3.0") def getBeta: Double = getTopicConcentration /** * Alias for [[setTopicConcentration()]] - * @since 1.3.0 */ + @Since("1.3.0") def setBeta(beta: Double): this.type = setTopicConcentration(beta) /** * Maximum number of iterations for learning. - * @since 1.3.0 */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Maximum number of iterations for learning. * (default = 20) - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -242,14 +242,14 @@ class LDA private ( /** * Random seed - * @since 1.3.0 */ + @Since("1.3.0") def getSeed: Long = seed /** * Random seed - * @since 1.3.0 */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -257,8 +257,8 @@ class LDA private ( /** * Period (in iterations) between checkpoints. - * @since 1.3.0 */ + @Since("1.3.0") def getCheckpointInterval: Int = checkpointInterval /** @@ -268,8 +268,8 @@ class LDA private ( * [[org.apache.spark.SparkContext]], this setting is ignored. 
* * @see [[org.apache.spark.SparkContext#setCheckpointDir]] - * @since 1.3.0 */ + @Since("1.3.0") def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval this @@ -280,8 +280,8 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation - * @since 1.4.0 */ + @Since("1.4.0") @DeveloperApi def getOptimizer: LDAOptimizer = ldaOptimizer @@ -289,8 +289,8 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer) - * @since 1.4.0 */ + @Since("1.4.0") @DeveloperApi def setOptimizer(optimizer: LDAOptimizer): this.type = { this.ldaOptimizer = optimizer @@ -300,8 +300,8 @@ class LDA private ( /** * Set the LDAOptimizer used to perform the actual calculation by algorithm name. * Currently "em", "online" are supported. - * @since 1.4.0 */ + @Since("1.4.0") def setOptimizer(optimizerName: String): this.type = { this.ldaOptimizer = optimizerName.toLowerCase match { @@ -321,8 +321,8 @@ class LDA private ( * (where the vocabulary size is the length of the vector). * Document IDs must be unique and >= 0. * @return Inferred LDA model - * @since 1.3.0 */ + @Since("1.3.0") def run(documents: RDD[(Long, Vector)]): LDAModel = { val state = ldaOptimizer.initialize(documents, this) var iter = 0 @@ -339,8 +339,8 @@ class LDA private ( /** * Java-friendly version of [[run()]] - * @since 1.3.0 */ + @Since("1.3.0") def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = { run(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6bc68a4..667374a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} @@ -192,24 +192,16 @@ class LocalLDAModel private[clustering] ( override protected[clustering] val gammaShape: Double = 100) extends LDAModel with Serializable { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def k: Int = topics.numCols - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def vocabSize: Int = topics.numRows - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def topicsMatrix: Matrix = topics - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val brzTopics = topics.toBreeze.toDenseMatrix Range(0, k).map { topicIndex => @@ -222,9 +214,7 @@ class LocalLDAModel private[clustering] ( override protected def formatVersion = "1.0" - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -238,16 +228,16 @@ class 
LocalLDAModel private[clustering] ( * * @param documents test corpus to use for calculating log likelihood * @return variational lower bound on the log likelihood of the entire corpus - * @since 1.5.0 */ + @Since("1.5.0") def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) /** * Java-friendly version of [[logLikelihood]] - * @since 1.5.0 */ + @Since("1.5.0") def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -258,8 +248,8 @@ class LocalLDAModel private[clustering] ( * * @param documents test corpus to use for calculating perplexity * @return Variational upper bound on log perplexity per token. - * @since 1.5.0 */ + @Since("1.5.0") def logPerplexity(documents: RDD[(Long, Vector)]): Double = { val corpusTokenCount = documents .map { case (_, termCounts) => termCounts.toArray.sum } @@ -267,9 +257,8 @@ class LocalLDAModel private[clustering] ( -logLikelihood(documents) / corpusTokenCount } - /** Java-friendly version of [[logPerplexity]] - * @since 1.5.0 - */ + /** Java-friendly version of [[logPerplexity]] */ + @Since("1.5.0") def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -347,8 +336,8 @@ class LocalLDAModel private[clustering] ( * for each document. * @param documents documents to predict topic mixture distributions for * @return An RDD of (document ID, topic mixture distribution for document) - * @since 1.3.0 */ + @Since("1.3.0") // TODO: declare in LDAModel and override once implemented in DistributedLDAModel def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = { // Double transpose because dirichletExpectation normalizes by row and we need to normalize @@ -376,8 +365,8 @@ class LocalLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.4.1 */ + @Since("1.4.1") def topicDistributions( documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) @@ -451,9 +440,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] { } } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def load(sc: SparkContext, path: String): LocalLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats @@ -510,8 +497,8 @@ class DistributedLDAModel private[clustering] ( * Convert model to a local model. * The local model stores the inferred topics but not the topic distributions for training * documents. - * @since 1.3.0 */ + @Since("1.3.0") def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -521,8 +508,8 @@ class DistributedLDAModel private[clustering] ( * No guarantees are given about the ordering of the topics. * * WARNING: This matrix is collected from an RDD. Beware memory usage when vocabSize, k are large. 
- * @since 1.3.0 */ + @Since("1.3.0") override lazy val topicsMatrix: Matrix = { // Collect row-major topics val termTopicCounts: Array[(Int, TopicCounts)] = @@ -541,9 +528,7 @@ class DistributedLDAModel private[clustering] ( Matrices.fromBreeze(brzTopics) } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val numTopics = k // Note: N_k is not needed to find the top terms, but it is needed to normalize weights @@ -582,8 +567,8 @@ class DistributedLDAModel private[clustering] ( * @return Array over topics. Each element represent as a pair of matching arrays: * (IDs for the documents, weights of the topic in these documents). * For each topic, documents are sorted in order of decreasing topic weights. - * @since 1.5.0 */ + @Since("1.5.0") def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = { val numTopics = k val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] = @@ -666,8 +651,8 @@ class DistributedLDAModel private[clustering] ( * - This excludes the prior; for that, use [[logPrior]]. * - Even with [[logPrior]], this is NOT the same as the data log likelihood given the * hyperparameters. - * @since 1.3.0 */ + @Since("1.3.0") lazy val logLikelihood: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -693,8 +678,8 @@ class DistributedLDAModel private[clustering] ( /** * Log probability of the current parameter estimate: * log P(topics, topic distributions for docs | alpha, eta) - * @since 1.3.0 */ + @Since("1.3.0") lazy val logPrior: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -725,8 +710,8 @@ class DistributedLDAModel private[clustering] ( * ("theta_doc"). * * @return RDD of (document ID, topic distribution) pairs - * @since 1.3.0 */ + @Since("1.3.0") def topicDistributions: RDD[(Long, Vector)] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => (docID.toLong, Vectors.fromBreeze(normalize(topicCounts, 1.0))) @@ -735,8 +720,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.4.1 */ + @Since("1.4.1") def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = { JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) } @@ -744,8 +729,8 @@ class DistributedLDAModel private[clustering] ( /** * For each document, return the top k weighted topics for that document and their weights. 
* @return RDD of (doc ID, topic indices, topic weights) - * @since 1.5.0 */ + @Since("1.5.0") def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => val topIndices = argtopk(topicCounts, k) @@ -761,8 +746,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topTopicsPerDocument]] - * @since 1.5.0 */ + @Since("1.5.0") def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { val topics = topTopicsPerDocument(k) topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() @@ -775,8 +760,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.5.0 */ + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { DistributedLDAModel.SaveLoadV1_0.save( sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration, @@ -877,9 +862,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def load(sc: SparkContext, path: String): DistributedLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index cb517f9..5c2aae6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -23,7 +23,7 @@ import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, su import breeze.numerics.{trigamma, abs, exp} import breeze.stats.distributions.{Gamma, RandBasis} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer @@ -35,8 +35,8 @@ import org.apache.spark.rdd.RDD * * An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can * hold optimizer-specific parameters for users to set. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi sealed trait LDAOptimizer { @@ -74,8 +74,8 @@ sealed trait LDAOptimizer { * - Paper which clearly explains several algorithms, including EM: * Asuncion, Welling, Smyth, and Teh. * "On Smoothing and Inference for Topic Models." UAI, 2009. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi final class EMLDAOptimizer extends LDAOptimizer { @@ -226,8 +226,8 @@ final class EMLDAOptimizer extends LDAOptimizer { * * Original Online LDA paper: * Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi final class OnlineLDAOptimizer extends LDAOptimizer { @@ -276,16 +276,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. 
- * @since 1.4.0 */ + @Since("1.4.0") def getTau0: Double = this.tau0 /** * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. * Default: 1024, following the original Online LDA paper. - * @since 1.4.0 */ + @Since("1.4.0") def setTau0(tau0: Double): this.type = { require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0") this.tau0 = tau0 @@ -294,16 +294,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Learning rate: exponential decay rate - * @since 1.4.0 */ + @Since("1.4.0") def getKappa: Double = this.kappa /** * Learning rate: exponential decay rate---should be between * (0.5, 1.0] to guarantee asymptotic convergence. * Default: 0.51, based on the original Online LDA paper. - * @since 1.4.0 */ + @Since("1.4.0") def setKappa(kappa: Double): this.type = { require(kappa >= 0, s"Online LDA kappa must be nonnegative, but was set to $kappa") this.kappa = kappa @@ -312,8 +312,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Mini-batch fraction, which sets the fraction of document sampled and used in each iteration - * @since 1.4.0 */ + @Since("1.4.0") def getMiniBatchFraction: Double = this.miniBatchFraction /** @@ -325,8 +325,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * maxIterations * miniBatchFraction >= 1. * * Default: 0.05, i.e., 5% of total documents. - * @since 1.4.0 */ + @Since("1.4.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { require(miniBatchFraction > 0.0 && miniBatchFraction <= 1.0, s"Online LDA miniBatchFraction must be in range (0,1], but was set to $miniBatchFraction") @@ -337,16 +337,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) * will be optimized during training. - * @since 1.5.0 */ + @Since("1.5.0") def getOptimzeAlpha: Boolean = this.optimizeAlpha /** * Sets whether to optimize alpha parameter during training. 
* * Default: false - * @since 1.5.0 */ + @Since("1.5.0") def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { this.optimizeAlpha = optimizeAlpha this http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index b4733ca..396b36f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -21,7 +21,7 @@ import org.json4s.JsonDSL._ import org.json4s._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl @@ -39,16 +39,14 @@ import org.apache.spark.{Logging, SparkContext, SparkException} * * @param k number of clusters * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class PowerIterationClusteringModel( val k: Int, val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable { - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { PowerIterationClusteringModel.SaveLoadV1_0.save(sc, this, path) } @@ -56,9 +54,7 @@ class PowerIterationClusteringModel( override protected def formatVersion: String = "1.0" } -/** - * @since 1.4.0 - */ +@Since("1.4.0") object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] { override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path) @@ -73,8 +69,8 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel" /** - * @since 1.4.0 */ + @Since("1.4.0") def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -87,9 +83,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode dataRDD.write.parquet(Loader.dataPath(path)) } - /** - * @since 1.4.0 - */ + @Since("1.4.0") def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { implicit val formats = DefaultFormats val sqlContext = new SQLContext(sc) @@ -136,14 +130,14 @@ class PowerIterationClustering private[clustering] ( /** * Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100, * initMode: "random"}. - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(k = 2, maxIterations = 100, initMode = "random") /** * Set the number of clusters. 
- * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this @@ -151,8 +145,8 @@ class PowerIterationClustering private[clustering] ( /** * Set maximum number of iterations of the power iteration loop - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -161,8 +155,8 @@ class PowerIterationClustering private[clustering] ( /** * Set the initialization mode. This can be either "random" to use a random vector * as vertex properties, or "degree" to use normalized sum similarities. Default: random. - * @since 1.3.0 */ + @Since("1.3.0") def setInitializationMode(mode: String): this.type = { this.initMode = mode match { case "random" | "degree" => mode @@ -182,8 +176,8 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result - * @since 1.5.0 */ + @Since("1.5.0") def run(graph: Graph[Double, Double]): PowerIterationClusteringModel = { val w = normalize(graph) val w0 = initMode match { @@ -204,8 +198,8 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result - * @since 1.3.0 */ + @Since("1.3.0") def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = { val w = normalize(similarities) val w0 = initMode match { @@ -217,8 +211,8 @@ class PowerIterationClustering private[clustering] ( /** * A Java-friendly version of [[PowerIterationClustering.run]]. - * @since 1.3.0 */ + @Since("1.3.0") def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) : PowerIterationClusteringModel = { run(similarities.rdd.asInstanceOf[RDD[(Long, Long, Double)]]) @@ -242,9 +236,7 @@ class PowerIterationClustering private[clustering] ( } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") @Experimental object PowerIterationClustering extends Logging { @@ -253,8 +245,8 @@ object PowerIterationClustering extends Logging { * Cluster assignment. * @param id node id * @param cluster assigned cluster id - * @since 1.3.0 */ + @Since("1.3.0") @Experimental case class Assignment(id: Long, cluster: Int) http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index a915804..41f2668 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.rdd.RDD @@ -63,9 +63,8 @@ import org.apache.spark.util.random.XORShiftRandom * such that at time t + h the discount applied to the data from t is 0.5. * The definition remains the same whether the time unit is given * as batches or points. 
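The PowerIterationClustering setters, run variants, and save/load pair annotated above are typically combined as in the sketch below. The SparkContext, the similarity RDD, and the output path are hypothetical placeholders, not taken from the patch.

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
import org.apache.spark.rdd.RDD

val sc: SparkContext = ???                          // hypothetical context
val similarities: RDD[(Long, Long, Double)] = ???   // (srcId, dstId, nonnegative similarity)

val model = new PowerIterationClustering()
  .setK(3)
  .setMaxIterations(20)
  .setInitializationMode("degree")   // or "random", the default
  .run(similarities)

// Each Assignment carries a node id and its cluster id.
model.assignments.collect().foreach(a => println(s"node ${a.id} -> cluster ${a.cluster}"))

// Persist and reload, matching the save/load pair annotated above.
model.save(sc, "/tmp/picModel")                     // path is illustrative
val reloaded = PowerIterationClusteringModel.load(sc, "/tmp/picModel")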
- * @since 1.2.0 - * */ +@Since("1.2.0") @Experimental class StreamingKMeansModel( override val clusterCenters: Array[Vector], @@ -73,8 +72,8 @@ class StreamingKMeansModel( /** * Perform a k-means update on a batch of data. - * @since 1.2.0 */ + @Since("1.2.0") def update(data: RDD[Vector], decayFactor: Double, timeUnit: String): StreamingKMeansModel = { // find nearest cluster to each point @@ -166,23 +165,23 @@ class StreamingKMeansModel( * .setRandomCenters(5, 100.0) * .trainOn(DStream) * }}} - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class StreamingKMeans( var k: Int, var decayFactor: Double, var timeUnit: String) extends Logging with Serializable { - /** @since 1.2.0 */ + @Since("1.2.0") def this() = this(2, 1.0, StreamingKMeans.BATCHES) protected var model: StreamingKMeansModel = new StreamingKMeansModel(null, null) /** * Set the number of clusters. - * @since 1.2.0 */ + @Since("1.2.0") def setK(k: Int): this.type = { this.k = k this @@ -190,8 +189,8 @@ class StreamingKMeans( /** * Set the decay factor directly (for forgetful algorithms). - * @since 1.2.0 */ + @Since("1.2.0") def setDecayFactor(a: Double): this.type = { this.decayFactor = a this @@ -199,8 +198,8 @@ class StreamingKMeans( /** * Set the half life and time unit ("batches" or "points") for forgetful algorithms. - * @since 1.2.0 */ + @Since("1.2.0") def setHalfLife(halfLife: Double, timeUnit: String): this.type = { if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) { throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit) @@ -213,8 +212,8 @@ class StreamingKMeans( /** * Specify initial centers directly. - * @since 1.2.0 */ + @Since("1.2.0") def setInitialCenters(centers: Array[Vector], weights: Array[Double]): this.type = { model = new StreamingKMeansModel(centers, weights) this @@ -226,8 +225,8 @@ class StreamingKMeans( * @param dim Number of dimensions * @param weight Weight for each center * @param seed Random seed - * @since 1.2.0 */ + @Since("1.2.0") def setRandomCenters(dim: Int, weight: Double, seed: Long = Utils.random.nextLong): this.type = { val random = new XORShiftRandom(seed) val centers = Array.fill(k)(Vectors.dense(Array.fill(dim)(random.nextGaussian()))) @@ -238,8 +237,8 @@ class StreamingKMeans( /** * Return the latest model. - * @since 1.2.0 */ + @Since("1.2.0") def latestModel(): StreamingKMeansModel = { model } @@ -251,8 +250,8 @@ class StreamingKMeans( * and updates the model using each batch of data from the stream. * * @param data DStream containing vector data - * @since 1.2.0 */ + @Since("1.2.0") def trainOn(data: DStream[Vector]) { assertInitialized() data.foreachRDD { (rdd, time) => @@ -262,8 +261,8 @@ class StreamingKMeans( /** * Java-friendly version of `trainOn`. - * @since 1.4.0 */ + @Since("1.4.0") def trainOn(data: JavaDStream[Vector]): Unit = trainOn(data.dstream) /** @@ -271,8 +270,8 @@ class StreamingKMeans( * * @param data DStream containing vector data * @return DStream containing predictions - * @since 1.2.0 */ + @Since("1.2.0") def predictOn(data: DStream[Vector]): DStream[Int] = { assertInitialized() data.map(model.predict) @@ -280,8 +279,8 @@ class StreamingKMeans( /** * Java-friendly version of `predictOn`. 
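The StreamingKMeans setters and training/prediction methods annotated above mirror the example already given in the class scaladoc; the sketch below expands it slightly, assuming hypothetical training and test DStreams that are not part of this patch.

import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.streaming.dstream.DStream

// Hypothetical input streams of feature vectors.
val trainingData: DStream[Vector] = ???
val testData: DStream[Vector] = ???

val skm = new StreamingKMeans()
  .setK(3)
  .setDecayFactor(0.5)                       // forgetfulness; setHalfLife(...) is the alternative
  .setRandomCenters(dim = 5, weight = 100.0) // random initial centers in 5 dimensions

skm.trainOn(trainingData)                    // updates the model on each incoming batch
val predictions = skm.predictOn(testData)    // DStream of cluster indices
predictions.print()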
- * @since 1.4.0 */ + @Since("1.4.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Integer] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Integer]]) } @@ -292,8 +291,8 @@ class StreamingKMeans( * @param data DStream containing (key, feature vector) pairs * @tparam K key type * @return DStream containing the input keys and the predictions as values - * @since 1.2.0 */ + @Since("1.2.0") def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Int)] = { assertInitialized() data.mapValues(model.predict) @@ -301,8 +300,8 @@ class StreamingKMeans( /** * Java-friendly version of `predictOnValues`. - * @since 1.4.0 */ + @Since("1.4.0") def predictOnValues[K]( data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Integer] = { implicit val tag = fakeClassTag[K] http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 486741e..76ae847 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.evaluation.binary._ @@ -41,8 +41,8 @@ import org.apache.spark.sql.DataFrame * of bins may not exactly equal numBins. The last bin in each partition may * be smaller as a result, meaning there may be an extra sample at * partition boundaries. - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class BinaryClassificationMetrics( val scoreAndLabels: RDD[(Double, Double)], @@ -52,8 +52,8 @@ class BinaryClassificationMetrics( /** * Defaults `numBins` to 0. - * @since 1.0.0 */ + @Since("1.0.0") def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) /** @@ -65,16 +65,16 @@ class BinaryClassificationMetrics( /** * Unpersist intermediate RDDs used in the computation. - * @since 1.0.0 */ + @Since("1.0.0") def unpersist() { cumulativeCounts.unpersist() } /** * Returns thresholds in descending order. - * @since 1.0.0 */ + @Since("1.0.0") def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -82,8 +82,8 @@ class BinaryClassificationMetrics( * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic - * @since 1.0.0 */ + @Since("1.0.0") def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) val sc = confusions.context @@ -94,16 +94,16 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. - * @since 1.0.0 */ + @Since("1.0.0") def areaUnderROC(): Double = AreaUnderCurve.of(roc()) /** * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. 
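For the BinaryClassificationMetrics methods annotated above, a minimal sketch of typical use follows; the (score, label) RDD is a hypothetical placeholder.

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.rdd.RDD

// Hypothetical (score, label) pairs with labels in {0.0, 1.0}.
val scoreAndLabels: RDD[(Double, Double)] = ???

val metrics = new BinaryClassificationMetrics(scoreAndLabels)   // numBins defaults to 0
println(s"AUC-ROC = ${metrics.areaUnderROC()}")
println(s"AUC-PR  = ${metrics.areaUnderPR()}")

// roc() yields (FPR, TPR) points; pr() yields (recall, precision) points.
val rocPoints = metrics.roc().collect()
metrics.unpersist()   // release the cached intermediate RDDs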
* @see http://en.wikipedia.org/wiki/Precision_and_recall - * @since 1.0.0 */ + @Since("1.0.0") def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) val sc = confusions.context @@ -113,8 +113,8 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. - * @since 1.0.0 */ + @Since("1.0.0") def areaUnderPR(): Double = AreaUnderCurve.of(pr()) /** @@ -122,26 +122,26 @@ class BinaryClassificationMetrics( * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score - * @since 1.0.0 */ + @Since("1.0.0") def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) /** * Returns the (threshold, F-Measure) curve with beta = 1.0. - * @since 1.0.0 */ + @Since("1.0.0") def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) /** * Returns the (threshold, precision) curve. - * @since 1.0.0 */ + @Since("1.0.0") def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) /** * Returns the (threshold, recall) curve. - * @since 1.0.0 */ + @Since("1.0.0") def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) private lazy val ( http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index dddfa3e..02e89d9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.evaluation import scala.collection.Map import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame @@ -30,8 +30,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for multiclass classification. * * @param predictionAndLabels an RDD of (prediction, label) pairs. - * @since 1.1.0 */ +@Since("1.1.0") @Experimental class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { @@ -65,8 +65,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * predicted classes are in columns, * they are ordered by class label ascending, * as in "labels" - * @since 1.1.0 */ + @Since("1.1.0") def confusionMatrix: Matrix = { val n = labels.size val values = Array.ofDim[Double](n * n) @@ -85,15 +85,15 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns true positive rate for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def truePositiveRate(label: Double): Double = recall(label) /** * Returns false positive rate for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) fp.toDouble / (labelCount - labelCountByClass(label)) @@ -102,8 +102,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision for a given label (category) * @param label the label. 
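The MulticlassMetrics members annotated above (confusionMatrix and the per-label rates) are typically read off as in this sketch; the prediction/label RDD is a hypothetical placeholder.

import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.rdd.RDD

// Hypothetical (prediction, label) pairs from some classifier.
val predictionAndLabels: RDD[(Double, Double)] = ???

val metrics = new MulticlassMetrics(predictionAndLabels)
println(metrics.confusionMatrix)                       // rows/columns ordered by ascending label
println(s"TPR(1.0) = ${metrics.truePositiveRate(1.0)}")
println(s"FPR(1.0) = ${metrics.falsePositiveRate(1.0)}")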
- * @since 1.1.0 */ + @Since("1.1.0") def precision(label: Double): Double = { val tp = tpByClass(label) val fp = fpByClass.getOrElse(label, 0) @@ -113,16 +113,16 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns recall for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) /** * Returns f-measure for a given label (category) * @param label the label. * @param beta the beta parameter. - * @since 1.1.0 */ + @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) val r = recall(label) @@ -133,8 +133,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns f1-measure for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def fMeasure(label: Double): Double = fMeasure(label, 1.0) /** @@ -187,8 +187,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged f-measure * @param beta the beta parameter. - * @since 1.1.0 */ + @Since("1.1.0") def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount }.sum http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala index 77cb1e0..a0a8d9c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.evaluation +import org.apache.spark.annotation.Since import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.sql.DataFrame @@ -25,8 +26,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for multilabel classification. * @param predictionAndLabels an RDD of (predictions, labels) pairs, * both are non-null Arrays, each with unique elements. - * @since 1.2.0 */ +@Since("1.2.0") class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { /** @@ -104,8 +105,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns precision for a given label (category) * @param label the label. - * @since 1.2.0 */ + @Since("1.2.0") def precision(label: Double): Double = { val tp = tpPerClass(label) val fp = fpPerClass.getOrElse(label, 0L) @@ -115,8 +116,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns recall for a given label (category) * @param label the label. - * @since 1.2.0 */ + @Since("1.2.0") def recall(label: Double): Double = { val tp = tpPerClass(label) val fn = fnPerClass.getOrElse(label, 0L) @@ -126,8 +127,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns f1-measure for a given label (category) * @param label the label. 
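The per-label precision, recall, and f1Measure methods of MultilabelMetrics annotated above can be exercised as in the sketch below; the input RDD is a hypothetical placeholder.

import org.apache.spark.mllib.evaluation.MultilabelMetrics
import org.apache.spark.rdd.RDD

// Hypothetical (predicted labels, true labels) pairs; each array holds unique elements.
val predictionAndLabels: RDD[(Array[Double], Array[Double])] = ???

val metrics = new MultilabelMetrics(predictionAndLabels)
val label = 1.0
println(s"precision($label) = ${metrics.precision(label)}")
println(s"recall($label)    = ${metrics.recall(label)}")
println(s"f1Measure($label) = ${metrics.f1Measure(label)}")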
- * @since 1.2.0 */ + @Since("1.2.0") def f1Measure(label: Double): Double = { val p = precision(label) val r = recall(label) http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index 063fbed..a7f43f0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} import org.apache.spark.rdd.RDD @@ -34,8 +34,8 @@ import org.apache.spark.rdd.RDD * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance. * * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs. - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]) extends Logging with Serializable { @@ -56,8 +56,8 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated precision, must be positive * @return the average precision at the first k ranking positions - * @since 1.2.0 */ + @Since("1.2.0") def precisionAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -126,8 +126,8 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated ndcg, must be positive * @return the average ndcg at the first k ranking positions - * @since 1.2.0 */ + @Since("1.2.0") def ndcgAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -165,8 +165,8 @@ object RankingMetrics { /** * Creates a [[RankingMetrics]] instance (for Java users). 
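For the RankingMetrics methods annotated above, a minimal sketch follows; the RDD of (predicted ranking, ground-truth set) pairs is a hypothetical placeholder, here keyed by integer item ids.

import org.apache.spark.mllib.evaluation.RankingMetrics
import org.apache.spark.rdd.RDD

// Hypothetical (predicted ranking, ground-truth set) pairs of item ids.
val predictionAndLabels: RDD[(Array[Int], Array[Int])] = ???

val metrics = new RankingMetrics(predictionAndLabels)
println(s"Precision@5 = ${metrics.precisionAt(5)}")   // k must be positive
println(s"NDCG@5      = ${metrics.ndcgAt(5)}")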
* @param predictionAndLabels a JavaRDD of (predicted ranking, ground truth set) pairs - * @since 1.4.0 */ + @Since("1.4.0") def of[E, T <: jl.Iterable[E]](predictionAndLabels: JavaRDD[(T, T)]): RankingMetrics[E] = { implicit val tag = JavaSparkContext.fakeClassTag[E] val rdd = predictionAndLabels.rdd.map { case (predictions, labels) => http://git-wip-us.apache.org/repos/asf/spark/blob/f5b028ed/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 54dfd8c..36a6c35 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.Logging import org.apache.spark.mllib.linalg.Vectors @@ -29,8 +29,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for regression. * * @param predictionAndObservations an RDD of (prediction, observation) pairs. - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging { @@ -67,8 +67,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the variance explained by regression. * explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] - * @since 1.2.0 */ + @Since("1.2.0") def explainedVariance: Double = { SSreg / summary.count } @@ -76,8 +76,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. - * @since 1.2.0 */ + @Since("1.2.0") def meanAbsoluteError: Double = { summary.normL1(1) / summary.count } @@ -85,8 +85,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. - * @since 1.2.0 */ + @Since("1.2.0") def meanSquaredError: Double = { SSerr / summary.count } @@ -94,8 +94,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the root mean squared error, which is defined as the square root of * the mean squared error. - * @since 1.2.0 */ + @Since("1.2.0") def rootMeanSquaredError: Double = { math.sqrt(this.meanSquaredError) } @@ -103,8 +103,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns R^2^, the unadjusted coefficient of determination. * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] - * @since 1.2.0 */ + @Since("1.2.0") def r2: Double = { 1 - SSerr / SStot } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org
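Closing out the quoted diff, the RegressionMetrics members annotated in the last file are typically read as in this sketch; the (prediction, observation) RDD is a hypothetical placeholder.

import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.rdd.RDD

// Hypothetical (prediction, observation) pairs from a regression model.
val predictionAndObservations: RDD[(Double, Double)] = ???

val metrics = new RegressionMetrics(predictionAndObservations)
println(s"MAE  = ${metrics.meanAbsoluteError}")
println(s"MSE  = ${metrics.meanSquaredError}")
println(s"RMSE = ${metrics.rootMeanSquaredError}")
println(s"R^2  = ${metrics.r2}")
println(s"explained variance = ${metrics.explainedVariance}")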