Mailing-List: contact reviews-help@spark.apache.org; run by ezmlm
Precedence: bulk
From: liancheng <git@git.apache.org>
To: reviews@spark.apache.org
Reply-To: reviews@spark.apache.org
References: <git-pr-5339-spark@git.apache.org>
In-Reply-To: <git-pr-5339-spark@git.apache.org>
Subject: [GitHub] spark pull request: [SPARK-6575][SQL] Converted Parquet
 Metastore ...
Content-Type: text/plain
Message-Id: <20150403013117.B80D9E2F4C@git1-us-west.apache.org>
Date: Fri,  3 Apr 2015 01:31:17 +0000 (UTC)

Github user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5339#discussion_r27711275
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala ---
    @@ -390,6 +392,116 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
     
         sql("DROP TABLE ms_convert")
       }
    +
    +  test("Caching converted data source Parquet Relations") {
    +    def checkCached(tableIdentifer: catalog.QualifiedTableName): Unit = {
    +      // Converted test_parquet should be cached.
    +      catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) match {
    +        case null => fail("Converted test_parquet should be cached in the cache.")
    +        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
    +        case other =>
    +          fail(
    +            "The cached test_parquet should be a Parquet Relation. " +
    +              s"However, $other is returned form the cache.")
    +      }
    +    }
    +
    +    sql("DROP TABLE IF EXISTS test_insert_parquet")
    +    sql("DROP TABLE IF EXISTS test_parquet_partitioned_cache_test")
    +
    +    sql(
    +      """
    +        |create table test_insert_parquet
    +        |(
    +        |  intField INT,
    +        |  stringField STRING
    +        |)
    +        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    +        |STORED AS
    +        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    +        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    +      """.stripMargin)
    +
    +    var tableIdentifer = catalog.QualifiedTableName("default", "test_insert_parquet")
    +
    +    // First, make sure the converted test_parquet is not cached.
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +    // Table lookup will make the table cached.
    +    table("test_insert_parquet")
    +    checkCached(tableIdentifer)
    +    // For insert into non-partitioned table, we will do the conversion,
    +    // so the converted test_insert_parquet should be cached.
    +    invalidateTable("test_insert_parquet")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_insert_parquet
    +        |select a, b from jt
    +      """.stripMargin)
    +    checkCached(tableIdentifer)
    +    // Make sure we can read the data.
    +    checkAnswer(
    +      sql("select * from test_insert_parquet"),
    +      sql("select a, b from jt").collect())
    +    // Invalidate the cache.
    +    invalidateTable("test_insert_parquet")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +
    +    // Create a partitioned table.
    +    sql(
    +      """
    +        |create table test_parquet_partitioned_cache_test
    +        |(
    +        |  intField INT,
    +        |  stringField STRING
    +        |)
    +        |PARTITIONED BY (date string)
    +        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    +        |STORED AS
    +        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    +        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    +      """.stripMargin)
    +
    +    tableIdentifer = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_parquet_partitioned_cache_test
    +        |PARTITION (date='2015-04-01')
    +        |select a, b from jt
    +      """.stripMargin)
    +    // Right now, insert into a partitioned Parquet is not supported in data source Parquet.
    +    // So, we expect it is not cached.
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
    +    sql(
    +      """
    +        |INSERT INTO TABLE test_parquet_partitioned_cache_test
    +        |PARTITION (date='2015-04-02')
    +        |select a, b from jt
    +      """.stripMargin)
    +    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifer) === null)
    +    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
    --- End diff --
    
    This last `setConf` call should be unnecessary since we are in the `ParquetDataSourceOnMetastoreSuite`.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA
ticket with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org