spark-reviews mailing list archives

From jkbradley <...@git.apache.org>
Subject [GitHub] spark pull request #20829: [SPARK-23690][ML] Add handleinvalid to VectorAsse...
Date Tue, 20 Mar 2018 20:42:33 GMT
Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20829#discussion_r175911314
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala ---
    @@ -136,34 +172,88 @@ class VectorAssembler @Since("1.4.0") (@Since("1.4.0") override val uid: String)
     @Since("1.6.0")
     object VectorAssembler extends DefaultParamsReadable[VectorAssembler] {
     
    +  private[feature] val SKIP_INVALID: String = "skip"
    +  private[feature] val ERROR_INVALID: String = "error"
    +  private[feature] val KEEP_INVALID: String = "keep"
    +  private[feature] val supportedHandleInvalids: Array[String] =
    +    Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
    +
    +
    +  private[feature] def getLengthsFromFirst(dataset: Dataset[_],
    +                                           columns: Seq[String]): Map[String, Int] = {
    +    try {
    +      val first_row = dataset.toDF.select(columns.map(col): _*).first
    +      columns.zip(first_row.toSeq).map {
    +        case (c, x) => c -> x.asInstanceOf[Vector].size
    +      }.toMap
    +    } catch {
    +      case e: NullPointerException => throw new NullPointerException(
    +        "Saw null value on the first row: " + e.toString)
    +      case e: NoSuchElementException => throw new NoSuchElementException(
    +        "Cannot infer vector size from all empty DataFrame" + e.toString)
    +    }
    +  }
    +
    +  private[feature] def getLengths(dataset: Dataset[_], columns: Seq[String],
    +                                  handleInvalid: String) = {
    +    val group_sizes = columns.map { c =>
    +      c -> AttributeGroup.fromStructField(dataset.schema(c)).size
    +    }.toMap
    +    val missing_columns: Seq[String] = group_sizes.filter(_._2 == -1).keys.toSeq
    +    val first_sizes: Map[String, Int] = (missing_columns.nonEmpty, handleInvalid) match {
    +      case (true, VectorAssembler.ERROR_INVALID) =>
    +        getLengthsFromFirst(dataset, missing_columns)
    +      case (true, VectorAssembler.SKIP_INVALID) =>
    +        getLengthsFromFirst(dataset.na.drop, missing_columns)
    --- End diff --
    
    This will drop rows with NA values in extraneous columns. I.e., even if the VectorAssembler
    is only assembling columns A and B, a NaN in column C will cause that row to be dropped.
    Pass the list of columns you care about to the drop() method.
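    
    A minimal sketch of that suggestion, assuming the assembled input columns are
    available as a Seq[String] (the helper name below is illustrative, not part of
    the patch):
    
        // Sketch only: restrict the null/NaN check to the columns being assembled,
        // so a null in an unrelated column does not drop the row.
        import org.apache.spark.sql.DataFrame
    
        def dropNullsInAssembledCols(df: DataFrame, assembledCols: Seq[String]): DataFrame = {
          // DataFrameNaFunctions.drop(cols) only considers the given columns.
          df.na.drop(assembledCols)
        }
    
    In the diff above this would roughly correspond to calling
    getLengthsFromFirst(dataset.na.drop(columns), missing_columns) rather than
    using the bare dataset.na.drop.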


---
