spark-reviews mailing list archives

From gatorsmile <...@git.apache.org>
Subject [GitHub] spark pull request #20087: [SPARK-21786][SQL] The 'spark.sql.parquet.compres...
Date Thu, 18 Jan 2018 05:33:53 GMT
Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20087#discussion_r162252901
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala ---
    @@ -0,0 +1,321 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.hive
    +
    +import java.io.File
    +
    +import scala.collection.JavaConverters._
    +
    +import org.apache.hadoop.fs.Path
    +import org.apache.orc.OrcConf.COMPRESS
    +import org.apache.parquet.hadoop.ParquetOutputFormat
    +import org.scalatest.BeforeAndAfterAll
    +
    +import org.apache.spark.sql.execution.datasources.orc.OrcOptions
    +import org.apache.spark.sql.execution.datasources.parquet.{ParquetOptions, ParquetTest}
    +import org.apache.spark.sql.hive.orc.OrcFileOperator
    +import org.apache.spark.sql.hive.test.TestHiveSingleton
    +import org.apache.spark.sql.internal.SQLConf
    +
    +class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with BeforeAndAfterAll {
    +  import spark.implicits._
    +
    +  override def beforeAll(): Unit = {
    +    super.beforeAll()
    +    (0 until maxRecordNum).toDF("a").createOrReplaceTempView("table_source")
    +  }
    +
    +  override def afterAll(): Unit = {
    +    try {
    +      spark.catalog.dropTempView("table_source")
    +    } finally {
    +      super.afterAll()
    +    }
    +  }
    +
    +  private val maxRecordNum = 500
    +
    +  private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match {
    +    case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key
    +    case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key
    +  }
    +
    +  private def getSparkCompressionConfName(format: String): String = format.toLowerCase match {
    +    case "parquet" => SQLConf.PARQUET_COMPRESSION.key
    +    case "orc" => SQLConf.ORC_COMPRESSION.key
    +  }
    +
    +  private def getHiveCompressPropName(format: String): String = format.toLowerCase match {
    +    case "parquet" => ParquetOutputFormat.COMPRESSION
    +    case "orc" => COMPRESS.getAttribute
    +  }
    +
    +  private def normalizeCodecName(format: String, name: String): String = {
    +    format.toLowerCase match {
    +      case "parquet" => ParquetOptions.shortParquetCompressionCodecNames(name).name()
    +      case "orc" => OrcOptions.shortOrcCompressionCodecNames(name)
    +    }
    +  }
    +
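    +  // Reads back the compression codec actually recorded in the written data files:
    +  // for Parquet, from the column chunk metadata in each file footer; for ORC, from
    +  // each data file's reader.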
    +  private def getTableCompressionCodec(path: String, format: String): Seq[String] = {
    +    val hadoopConf = spark.sessionState.newHadoopConf()
    +    val codecs = format.toLowerCase match {
    +      case "parquet" => for {
    +        footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf)
    +        block <- footer.getParquetMetadata.getBlocks.asScala
    +        column <- block.getColumns.asScala
    +      } yield column.getCodec.name()
    +      case "orc" => new File(path).listFiles().filter{ file =>
    +        file.isFile && !file.getName.endsWith(".crc") && file.getName
!= "_SUCCESS"
    +      }.map { orcFile =>
    +        OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString
    +      }.toSeq
    +    }
    +    codecs.distinct
    +  }
    +
    +  private def createTable(
    +      rootDir: File,
    +      tableName: String,
    +      isPartitioned: Boolean,
    +      format: String,
    +      compressionCodec: Option[String]): Unit = {
    +    val tblProperties = compressionCodec match {
    +      case Some(prop) => s"TBLPROPERTIES('${getHiveCompressPropName(format)}'='$prop')"
    +      case _ => ""
    +    }
    +    val partitionCreate = if (isPartitioned) "PARTITIONED BY (p string)" else ""
    +    sql(
    +      s"""
    +        |CREATE TABLE $tableName(a int)
    +        |$partitionCreate
    +        |STORED AS $format
    +        |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
    +        |$tblProperties
    +      """.stripMargin)
    +  }
    +
    +  private def writeDataToTable(
    +      tableName: String,
    +      partition: Option[String]): Unit = {
    +    val partitionInsert = partition.map(p => s"partition (p='$p')").mkString
    +    sql(
    +      s"""
    +        |INSERT INTO TABLE $tableName
    +        |$partitionInsert
    +        |SELECT * FROM table_source
    +      """.stripMargin)
    +  }
    +
    +  private def getTableSize(path: String): Long = {
    +    val dir = new File(path)
    +    val files = dir.listFiles().filter(_.getName.startsWith("part-"))
    +    files.map(_.length()).sum
    +  }
    +
    +  private def getTablePartitionPath(dir: File, tableName: String, partition: Option[String]) = {
    +    val partitionPath = partition.map(p => s"p=$p").mkString
    +    s"${dir.getPath.stripSuffix("/")}/$tableName/$partitionPath"
    +  }
    +
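    +  // Writes the table once with the UNCOMPRESSED/NONE codec and returns the resulting
    +  // on-disk size, used as the baseline for the size comparisons below.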
    +  private def getUncompressedDataSizeByFormat(
    +      format: String, isPartitioned: Boolean): Long = {
    +    var totalSize = 0L
    +    val tableName = s"tbl_$format"
    +    val codecName = normalizeCodecName(format, "uncompressed")
    +    withSQLConf(getSparkCompressionConfName(format) -> codecName) {
    +      withTempDir { tmpDir =>
    +        withTable(tableName) {
    +          createTable(tmpDir, tableName, isPartitioned, format, Option(codecName))
    +          val partition = if (isPartitioned) Some("test") else None
    +          writeDataToTable(tableName, partition)
    +          val path = getTablePartitionPath(tmpDir, tableName, partition)
    +          totalSize = getTableSize(path)
    +        }
    +      }
    +    }
    +    assert(totalSize > 0L)
    +    totalSize
    +  }
    +
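    +  // Creates a table with the given table-level codec (if any), writes data into it,
    +  // and passes the codec actually used in the files plus the on-disk size to `assertion`.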
    +  private def checkCompressionCodecForTable(
    +      format: String,
    +      isPartitioned: Boolean,
    +      compressionCodec: Option[String])
    +      (assertion: (String, Long) => Unit): Unit = {
    +    val tableName = s"tbl_$format$isPartitioned"
    +    withTempDir { tmpDir =>
    +      withTable(tableName) {
    +        createTable(tmpDir, tableName, isPartitioned, format, compressionCodec)
    +        val partition = if (isPartitioned) Some("test") else None
    +        writeDataToTable(tableName, partition)
    +        val path = getTablePartitionPath(tmpDir, tableName, partition)
    +        val relCompressionCodecs = getTableCompressionCodec(path, format)
    +        assert(relCompressionCodecs.length == 1)
    +        val tableSize = getTableSize(path)
    +        assertion(relCompressionCodecs.head, tableSize)
    +      }
    +    }
    +  }
    +
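    +  // Exercises every (table-level codec, session-level codec) pair under the given
    +  // convertMetastore setting and delegates verification to `assertionCompressionCodec`.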
    +  private def checkTableCompressionCodecForCodecs(
    +      format: String,
    +      isPartitioned: Boolean,
    +      convertMetastore: Boolean,
    +      compressionCodecs: List[String],
    +      tableCompressionCodecs: List[String])
    +      (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
    +    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
    +      tableCompressionCodecs.foreach { tableCompression =>
    +        compressionCodecs.foreach { sessionCompressionCodec =>
    +          withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
    +            // 'tableCompression = null' means no table-level compression
    +            val compression = Option(tableCompression)
    +            checkCompressionCodecForTable(format, isPartitioned, compression) {
    +              case (realCompressionCodec, tableSize) => assertionCompressionCodec(compression,
    +                sessionCompressionCodec, realCompressionCodec, tableSize)
    +            }
    +          }
    +        }
    +      }
    +    }
    +  }
    +
    +  // When the amount of data is small, the compressed data size may be larger than the uncompressed one,
    +  // so we just check the difference when compressionCodec is not NONE or UNCOMPRESSED.
    +  private def checkTableSize(
    +      format: String,
    +      compressionCodec: String,
    +      isPartitioned: Boolean,
    +      convertMetastore: Boolean,
    +      tableSize: Long): Boolean = {
    +    format match {
    +      case "parquet" =>
    +        val uncompressedSize = if (!convertMetastore || isPartitioned) {
    +          getUncompressedDataSizeByFormat(format, isPartitioned = true)
    +        } else {
    +          getUncompressedDataSizeByFormat(format, isPartitioned = false)
    +        }
    +
    +        if (compressionCodec == "UNCOMPRESSED") {
    +          tableSize == uncompressedSize
    +        } else {
    +          tableSize != uncompressedSize
    +        }
    +      case "orc" =>
    +        val uncompressedSize = if (!convertMetastore || isPartitioned) {
    +          getUncompressedDataSizeByFormat(format, isPartitioned = true)
    +        } else {
    +          getUncompressedDataSizeByFormat(format, isPartitioned = false)
    +        }
    +
    +        if (compressionCodec == "NONE") {
    +          tableSize == uncompressedSize
    +        } else {
    +          tableSize != uncompressedSize
    +        }
    +      case _ => false
    +    }
    +  }
    +
    +  def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = {
    +    Seq(true, false).foreach { isPartitioned =>
    +      Seq(true, false).foreach { convertMetastore =>
    +        checkTableCompressionCodecForCodecs(
    +          format,
    +          isPartitioned,
    +          convertMetastore,
    +          compressionCodecs = compressCodecs,
    +          tableCompressionCodecs = compressCodecs) {
    --- End diff ---
    
    Also add another scenario:
    ```
    compressionCodecs = Nil,
    tableCompressionCodecs = compressCodecs 
    ```
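
    A rough sketch, purely illustrative, of where such a call could sit inside `checkForTableWithCompressProp`. Note the helper as diffed loops over `compressionCodecs`, so an empty list would need a fallback (e.g. running once against the session's current default codec) for the assertion to execute at all:
    ```
    // Hypothetical addition: only the table-level property varies, so the table codec
    // alone should determine the codec used for the written files.
    checkTableCompressionCodecForCodecs(
      format,
      isPartitioned,
      convertMetastore,
      compressionCodecs = Nil,
      tableCompressionCodecs = compressCodecs) {
      case (tableCodec, _, realCodec, _) =>
        // If a table-level codec is set, it should be the codec found in the files.
        assert(tableCodec.forall(_ == realCodec))
    }
    ```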

