spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yhuai <...@git.apache.org>
Subject [GitHub] spark pull request #15996: [SPARK-18567][SQL] Simplify CreateDataSourceTable...
Date Fri, 23 Dec 2016 00:54:48 GMT
Github user yhuai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15996#discussion_r93720195
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala ---
    @@ -364,48 +366,162 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
           throw new AnalysisException("Cannot create hive serde table with saveAsTable API")
         }
     
    -    val tableExists = df.sparkSession.sessionState.catalog.tableExists(tableIdent)
    -
    -    (tableExists, mode) match {
    -      case (true, SaveMode.Ignore) =>
    -        // Do nothing
    -
    -      case (true, SaveMode.ErrorIfExists) =>
    -        throw new AnalysisException(s"Table $tableIdent already exists.")
    -
    -      case _ =>
    -        val existingTable = if (tableExists) {
    -          Some(df.sparkSession.sessionState.catalog.getTableMetadata(tableIdent))
    -        } else {
    -          None
    -        }
    -        val storage = if (tableExists) {
    -          existingTable.get.storage
    -        } else {
    -          DataSource.buildStorageFormatFromOptions(extraOptions.toMap)
    -        }
    -        val tableType = if (tableExists) {
    -          existingTable.get.tableType
    -        } else if (storage.locationUri.isDefined) {
    -          CatalogTableType.EXTERNAL
    -        } else {
    -          CatalogTableType.MANAGED
    +    val catalog = df.sparkSession.sessionState.catalog
    +    val db = tableIdent.database.getOrElse(catalog.getCurrentDatabase)
    +    val tableIdentWithDB = tableIdent.copy(database = Some(db))
    +    val tableName = tableIdentWithDB.unquotedString
    +
    +    catalog.getTableMetadataOption(tableIdentWithDB) match {
    +      // If the table already exists...
    +      case Some(existingTable) =>
    +        mode match {
    +          case SaveMode.Ignore => // Do nothing
    +
    +          case SaveMode.ErrorIfExists =>
    +            throw new AnalysisException(s"Table $tableName already exists. You can set
SaveMode " +
    +              "to SaveMode.Append to insert data into the table or set SaveMode to "
+
    +              "SaveMode.Overwrite to overwrite the existing data.")
    +
    +          case SaveMode.Append =>
    +            if (existingTable.tableType == CatalogTableType.VIEW) {
    +              throw new AnalysisException("Saving data into a view is not allowed.")
    +            }
    +
    +            if (existingTable.provider.get == DDLUtils.HIVE_PROVIDER) {
    +              throw new AnalysisException(s"Saving data in the Hive serde table $tableName
is " +
    +                "not supported yet. Please use the insertInto() API as an alternative.")
    +            }
    +
    +            // Check if the specified data source match the data source of the existing
table.
    +            val existingProvider = DataSource.lookupDataSource(existingTable.provider.get)
    +            val specifiedProvider = DataSource.lookupDataSource(source)
    +            // TODO: Check that options from the resolved relation match the relation
that we are
    +            // inserting into (i.e. using the same compression).
    +            if (existingProvider != specifiedProvider) {
    +              throw new AnalysisException(s"The format of the existing table $tableName
is " +
    +                s"`${existingProvider.getSimpleName}`. It doesn't match the specified
format " +
    +                s"`${specifiedProvider.getSimpleName}`.")
    +            }
    +
    +            if (df.schema.length != existingTable.schema.length) {
    +              throw new AnalysisException(
    +                s"The column number of the existing table $tableName" +
    +                  s"(${existingTable.schema.catalogString}) doesn't match the data schema"
+
    +                  s"(${df.schema.catalogString})")
    +            }
    +
    +            val resolver = df.sparkSession.sessionState.conf.resolver
    +            val tableCols = existingTable.schema.map(_.name)
    +
    +            // As we are inserting into an existing table, we should respect the existing
schema and
    +            // adjust the column order of the given dataframe according to it, or throw
exception
    +            // if the column names do not match.
    +            val adjustedColumns = tableCols.map { col =>
    +              df.queryExecution.analyzed.resolve(Seq(col), resolver).getOrElse {
    +                val inputColumns = df.schema.map(_.name).mkString(", ")
    +                throw new AnalysisException(
    +                  s"cannot resolve '$col' given input columns: [$inputColumns]")
    +              }
    +            }
    +
    +            // Check if the specified partition columns match the existing table.
    +            val specifiedPartCols = CatalogUtils.normalizePartCols(
    +              tableName, tableCols, partitioningColumns.getOrElse(Nil), resolver)
    +            if (specifiedPartCols != existingTable.partitionColumnNames) {
    +              val existingPartCols = existingTable.partitionColumnNames.mkString(", ")
    +              throw new AnalysisException(
    +                s"""
    +                   |Specified partitioning does not match that of the existing table
$tableName.
    +                   |Specified partition columns: [${specifiedPartCols.mkString(", ")}]
    +                   |Existing partition columns: [$existingPartCols]
    +                """.stripMargin)
    +            }
    +
    +            // Check if the specified bucketing match the existing table.
    +            val specifiedBucketSpec = getBucketSpec.map { bucketSpec =>
    +              CatalogUtils.normalizeBucketSpec(tableName, tableCols, bucketSpec, resolver)
    +            }
    +            if (specifiedBucketSpec != existingTable.bucketSpec) {
    +              val specifiedBucketString =
    +                specifiedBucketSpec.map(_.toString).getOrElse("not bucketed")
    +              val existingBucketString =
    +                existingTable.bucketSpec.map(_.toString).getOrElse("not bucketed")
    +              throw new AnalysisException(
    +                s"""
    +                   |Specified bucketing does not match that of the existing table $tableName.
    +                   |Specified bucketing: $specifiedBucketString
    +                   |Existing bucketing: $existingBucketString
    +                """.stripMargin)
    +            }
    +
    +            // Reorder the columns of the given dataframe to match the existing table.
    +            val adjustedDataFrame = df.select(adjustedColumns.map(new Column(_)): _*)
    +
    +            if (classOf[CreatableRelationProvider].isAssignableFrom(existingProvider))
{
    +              // For data source that implementing `CreatableRelationProvider`, we should
call its
    +              // `createRelation` to do the data appending. `CreatableRelationProvider`
may not
    +              // provide an `InsertableRelation`, so we can't use `DataFrameWriter.insertInto`
here.
    +              existingProvider.newInstance().asInstanceOf[CreatableRelationProvider].createRelation(
    +                sqlContext = df.sparkSession.sqlContext,
    +                mode = mode,
    +                parameters = new CaseInsensitiveMap(extraOptions.toMap),
    +                data = adjustedDataFrame)
    +            } else {
    +              adjustedDataFrame.write.insertInto(tableIdentWithDB)
    --- End diff --
    
    Before this change, we always went to `createRelation`, right?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message