Mailing-List: contact reviews-help@spark.apache.org; run by ezmlm
Precedence: bulk
From: jkbradley <git@git.apache.org>
To: reviews@spark.apache.org
Reply-To: reviews@spark.apache.org
References: <git-pr-2451-spark@git.apache.org>
In-Reply-To: <git-pr-2451-spark@git.apache.org>
Subject: [GitHub] spark pull request: [WIP][SPARK-1486][MLlib] Multi Model
 Training ...
Content-Type: text/plain
Message-Id: <20140919201943.B43249CD5D0@tyr.zones.apache.org>
Date: Fri, 19 Sep 2014 20:19:43 +0000 (UTC)

Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/2451#discussion_r17806894
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala ---
    @@ -93,9 +1000,310 @@ object Matrices {
             require(dm.majorStride == dm.rows,
               "Do not support stride size different from the number of rows.")
             new DenseMatrix(dm.rows, dm.cols, dm.data)
    +      case sm: BSM[Double] =>
    +        new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data)
           case _ =>
             throw new UnsupportedOperationException(
               s"Do not support conversion from type ${breeze.getClass.getName}.")
         }
       }
    +
    +  /**
    +   * Generate a `DenseMatrix` consisting of zeros.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @return `Matrix` with size `numRows` x `numCols` and values of zeros
    +   */
    +  def zeros(numRows: Int, numCols: Int): Matrix = DenseMatrix.zeros(numRows, numCols)
    +
    +  /**
    +   * Generate a `DenseMatrix` consisting of ones.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @return `Matrix` with size `numRows` x `numCols` and values of ones
    +   */
    +  def ones(numRows: Int, numCols: Int): Matrix = DenseMatrix.ones(numRows, numCols)
    +
    +  /**
    +   * Generate an Identity Matrix in `DenseMatrix` format.
    +   * @param n number of rows and columns of the matrix
    +   * @return `Matrix` with size `n` x `n` and values of ones on the diagonal
    +   */
    +  def eye(n: Int): Matrix = DenseMatrix.eye(n)
    +
    +  /**
    +   * Generate an Identity Matrix in `SparseMatrix` format.
    +   * @param n number of rows and columns of the matrix
    +   * @return `Matrix` with size `n` x `n` and values of ones on the diagonal
    +   */
    +  def speye(n: Int): Matrix = SparseMatrix.speye(n)
    +
    +  /**
    +   * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1)
    +   */
    +  def rand(numRows: Int, numCols: Int): Matrix = DenseMatrix.rand(numRows, numCols)
    +
    +  /**
    +   * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1)
    +   */
    +  def randn(numRows: Int, numCols: Int): Matrix = DenseMatrix.randn(numRows, numCols)
    +
    +  /**
    +   * Generate a `SparseMatrix` consisting of i.i.d. uniform random numbers.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @param density the desired density for the matrix
    +   * @param seed the seed for the random generator
    +   * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1)
    +   */
    +  def sprand(
    +      numRows: Int,
    +      numCols: Int,
    +      density: Double,
    +      seed: Long = Utils.random.nextLong()): Matrix =
    +    SparseMatrix.sprand(numRows, numCols, density, seed)
    +
    +  /**
    +   * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
    +   * @param numRows number of rows of the matrix
    +   * @param numCols number of columns of the matrix
    +   * @param density the desired density for the matrix
    +   * @param seed the seed for the random generator
    +   * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1)
    +   */
    +  def sprandn(
    +      numRows: Int,
    +      numCols: Int,
    +      density: Double,
    +      seed: Long = Utils.random.nextLong()): Matrix =
    +    SparseMatrix.sprandn(numRows, numCols, density, seed)
    +
    +  /**
    +   * Generate a diagonal matrix in `DenseMatrix` format from the supplied values. Use
    +   * [[org.apache.spark.mllib.linalg.SparseMatrix.diag()]] in order to generate the matrix in
    +   * `SparseMatrix` format.
    +   * @param vector a `Vector` that will form the values on the diagonal of the matrix
    +   * @return Square `Matrix` with size `vector.size` x `vector.size` and the values of
    +   *         `vector` on the diagonal
    +   */
    +  def diag(vector: Vector): Matrix = DenseMatrix.diag(vector)
    +
    +  /**
    +   * Horizontally concatenate a sequence of matrices. The returned matrix will be in the format
    +   * the matrices are supplied in. Supplying a mix of dense and sparse matrices is not supported.
    +   * @param matrices sequence of matrices
    +   * @return a single `Matrix` composed of the matrices that were horizontally concatenated
    +   */
    +  private[mllib] def horzCat(matrices: Seq[Matrix]): Matrix = {
    +    if (matrices.size == 1) {
    +      return matrices(0)
    +    }
    +    val numRows = matrices(0).numRows
    +    var rowsMatch = true
    +    var isDense = false
    +    var isSparse = false
    +    for (mat <- matrices) {
    +      if (numRows != mat.numRows) rowsMatch = false
    +      mat match {
    +        case sparse: SparseMatrix => isSparse = true
    +        case dense: DenseMatrix => isDense = true
    +      }
    +    }
    +    require(rowsMatch, "The number of rows of the matrices in this array, don't match!")
    +    var numCols = 0
    +    matrices.foreach(numCols += _.numCols)
    +    if (isSparse && !isDense) {
    +      val allColPtrs: Array[Int] = Array(0) ++ matrices.flatMap { mat =>
    +        val ptr = mat.asInstanceOf[SparseMatrix].colPtrs
    +        ptr.slice(1, ptr.length)
    +      }
    +      var counter = 0
    +      val adjustedPtrs = allColPtrs.map { p =>
    +        counter += p
    +        counter
    +      }
    +      new SparseMatrix(numRows, numCols, adjustedPtrs,
    +        matrices.flatMap(_.asInstanceOf[SparseMatrix].rowIndices).toArray,
    +        matrices.flatMap(_.asInstanceOf[SparseMatrix].values).toArray)
    +    } else if (!isSparse && !isDense) {
    +      throw new IllegalArgumentException("The supplied matrices are neither in SparseMatrix or" +
    +        " DenseMatrix format!")
    +    }else {
    +      new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray).toArray)
    +    }
    +  }
    +  // Each entry of partitionMetaData pairs the index of a partition with the max number of
    +  // non-zeros in that partition, so that we can preallocate a memory-efficient buffer.
    +  private[mllib] def fromRDD(
    +      rows: RDD[(Double, Vector)],
    +      partitionMetaData: Array[(Int, Int)],
    +      batchSize : Int,
    +      buildSparseThreshold: Double,
    +      generateOnTheFly: Boolean = true): RDD[(DenseMatrix, Matrix)] = {
    +
    +    if (!generateOnTheFly){
    +      rows.mapPartitions { iter =>
    +        iter.grouped(batchSize)
    +      }.map(fromSeq(_, batchSize))
    +    }else {
    +      val numFeatures = rows.first()._2.size
    +
    +      rows.mapPartitionsWithIndex{ case (ind, iter) =>
    +        val findPartition = partitionMetaData.find(_._1 == ind)
    +        val matrixBuffer =
    +          if (findPartition.get._2 != -1) {
    +            val nnz = findPartition.get._2
    +            val density = nnz * 1.0 / (numFeatures * batchSize)
    +            if (density <= buildSparseThreshold) {
    +              (DenseMatrix.zeros(batchSize, 1), new SparseMatrix(numFeatures, batchSize,
    +                Array.fill(batchSize + 1)(0), Array.fill(nnz)(0), Array.fill(nnz)(0.0)))
    +            } else {
    +              (DenseMatrix.zeros(batchSize, 1), DenseMatrix.zeros(numFeatures, batchSize))
    +            }
    +          } else {
    +            (DenseMatrix.zeros(batchSize, 1), DenseMatrix.zeros(numFeatures, batchSize))
    +          }
    +        iter.grouped(batchSize).map(fromSeqIntoBuffer(_, matrixBuffer, batchSize)._2)
    +      }
    +    }
    +  }
    +
    +  // Collects data on the maximum number of non-zero elements in a partition for each
    +  // batch of matrices
    +  private[mllib] def getSparsityData(
    --- End diff --
    
    Should this and other methods which operate on labeled data be in a separate object from Matrices?  E.g., LabeledMatrices?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org