spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mengxr <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-3218, SPARK-3219, SPARK-3261, SPARK-342...
Date Mon, 06 Oct 2014 21:49:24 GMT
Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/2634#discussion_r18488302
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---
    @@ -17,429 +17,57 @@
     
     package org.apache.spark.mllib.clustering
     
    -import scala.collection.mutable.ArrayBuffer
     
    -import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm}
    -
    -import org.apache.spark.annotation.Experimental
    -import org.apache.spark.Logging
    -import org.apache.spark.SparkContext._
    -import org.apache.spark.mllib.linalg.{Vector, Vectors}
    -import org.apache.spark.mllib.util.MLUtils
    +import org.apache.spark.mllib.base.{FP, PointOps}
    +import org.apache.spark.mllib.clustering.metrics.FastEuclideanOps
     import org.apache.spark.rdd.RDD
    -import org.apache.spark.storage.StorageLevel
    -import org.apache.spark.util.random.XORShiftRandom
    -
    -/**
    - * K-means clustering with support for multiple parallel runs and a k-means++ like initialization
    - * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
    - * they are executed together with joint passes over the data for efficiency.
    - *
    - * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
    - * to it should be cached by the user.
    - */
    -class KMeans private (
    -    private var k: Int,
    -    private var maxIterations: Int,
    -    private var runs: Int,
    -    private var initializationMode: String,
    -    private var initializationSteps: Int,
    -    private var epsilon: Double) extends Serializable with Logging {
    -
    -  /**
    -   * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
    -   * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4}.
    -   */
    -  def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4)
    -
    -  /** Set the number of clusters to create (k). Default: 2. */
    -  def setK(k: Int): this.type = {
    --- End diff --
    
    This is a breaking change.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it enabled, or if the feature is enabled but not
working, please contact infrastructure at infrastructure@apache.org or file
a JIRA ticket with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message