spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From falaki <...@git.apache.org>
Subject [GitHub] spark pull request #14745: [SPARK-16896][SQL] Handle duplicated field names ...
Date Mon, 22 Aug 2016 21:53:23 GMT
Github user falaki commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14745#discussion_r75766440
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala ---
    @@ -57,28 +57,45 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
         val rdd = baseRdd(sparkSession, csvOptions, paths)
         val firstLine = findFirstLine(csvOptions, rdd)
         val firstRow = new CsvReader(csvOptions).parseLine(firstLine)
    -
    -    val header = if (csvOptions.headerFlag) {
    -      firstRow.zipWithIndex.map { case (value, index) =>
    -        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index"
else value
    -      }
    -    } else {
    -      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    -    }
    +    val header = makeSafeHeader(firstRow, csvOptions)
     
         val parsedRdd = tokenRdd(sparkSession, csvOptions, header, paths)
         val schema = if (csvOptions.inferSchemaFlag) {
           CSVInferSchema.infer(parsedRdd, header, csvOptions)
         } else {
           // By default fields are assumed to be StringType
           val schemaFields = header.map { fieldName =>
    -        StructField(fieldName.toString, StringType, nullable = true)
    +        StructField(fieldName, StringType, nullable = true)
           }
           StructType(schemaFields)
         }
         Some(schema)
       }
     
    +  /**
    +   * Generates a header from the given row which is null-safe and duplicates-safe.
    +   */
    +  private def makeSafeHeader(row: Array[String], options: CSVOptions): Array[String]
= {
    --- End diff --
    
    I suggest putting this function in utils and writing a separate unit test for it.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message