spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dav...@apache.org
Subject spark git commit: [SPARK-13574] [SQL] Add benchmark to measure string dictionary decode.
Date Wed, 02 Mar 2016 23:03:26 GMT
Repository: spark
Updated Branches:
  refs/heads/master b5a59a0fe -> e2780ce82


[SPARK-13574] [SQL] Add benchmark to measure string dictionary decode.

## What changes were proposed in this pull request?

Also updated the other benchmarks when the default to use vectorized decode was flipped.

Author: Nong Li <nong@databricks.com>

Closes #11454 from nongli/benchmark.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2780ce8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2780ce8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2780ce8

Branch: refs/heads/master
Commit: e2780ce8252ded93a695125c0a745d8b93193cca
Parents: b5a59a0
Author: Nong Li <nong@databricks.com>
Authored: Wed Mar 2 15:03:19 2016 -0800
Committer: Davies Liu <davies.liu@gmail.com>
Committed: Wed Mar 2 15:03:19 2016 -0800

----------------------------------------------------------------------
 .../parquet/ParquetReadBenchmark.scala          | 70 +++++++++++++++-----
 1 file changed, 52 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e2780ce8/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
index 660f0f1..14dbdf3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
@@ -37,6 +37,10 @@ object ParquetReadBenchmark {
   val sc = new SparkContext("local[1]", "test-sql-context", conf)
   val sqlContext = new SQLContext(sc)
 
+  // Set default configs. Individual cases will change them if necessary.
+  sqlContext.conf.setConfString(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true")
+  sqlContext.conf.setConfString(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
+
   def withTempPath(f: File => Unit): Unit = {
     val path = Utils.createTempDir()
     path.delete()
@@ -72,7 +76,7 @@ object ParquetReadBenchmark {
             .write.parquet(dir.getCanonicalPath)
         sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable")
 
-        sqlBenchmark.addCase("SQL Parquet Reader") { iter =>
+        sqlBenchmark.addCase("SQL Parquet Vectorized") { iter =>
           sqlContext.sql("select sum(id) from tempTable").collect()
         }
 
@@ -82,8 +86,8 @@ object ParquetReadBenchmark {
           }
         }
 
-        sqlBenchmark.addCase("SQL Parquet Vectorized") { iter =>
-          withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+        sqlBenchmark.addCase("SQL Parquet Non-Vectorized") { iter =>
+          withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
             sqlContext.sql("select sum(id) from tempTable").collect()
           }
         }
@@ -152,9 +156,9 @@ object ParquetReadBenchmark {
         Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
        SQL Single Int Column Scan:         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
        -------------------------------------------------------------------------------------------
-        SQL Parquet Reader                       1042 / 1208         15.1          66.2       1.0X
-        SQL Parquet MR                           1544 / 1607         10.2          98.2       0.7X
-        SQL Parquet Vectorized                    674 /  739         23.3          42.9       1.5X
+        SQL Parquet Vectorized                    657 /  778         23.9          41.8       1.0X
+        SQL Parquet MR                           1606 / 1731          9.8         102.1       0.4X
+        SQL Parquet Non-Vectorized               1133 / 1216         13.9          72.1       0.6X
         */
         sqlBenchmark.run()
 
@@ -172,8 +176,6 @@ object ParquetReadBenchmark {
   }
 
   def intStringScanBenchmark(values: Int): Unit = {
-    val benchmark = new Benchmark("Int and String Scan", values)
-
     withTempPath { dir =>
       withTempTable("t1", "tempTable") {
         sqlContext.range(values).registerTempTable("t1")
@@ -183,7 +185,7 @@ object ParquetReadBenchmark {
 
         val benchmark = new Benchmark("Int and String Scan", values)
 
-        benchmark.addCase("SQL Parquet Reader") { iter =>
+        benchmark.addCase("SQL Parquet Vectorized") { iter =>
           sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
         }
 
@@ -193,15 +195,14 @@ object ParquetReadBenchmark {
           }
         }
 
-        benchmark.addCase("SQL Parquet Vectorized") { iter =>
-          withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+        benchmark.addCase("SQL Parquet Non-vectorized") { iter =>
+          withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
             sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
           }
         }
 
-
         val files = SpecificParquetRecordReaderBase.listDirectory(dir).toArray
-        benchmark.addCase("ParquetReader") { num =>
+        benchmark.addCase("ParquetReader Non-vectorized") { num =>
           var sum1 = 0L
           var sum2 = 0L
           files.map(_.asInstanceOf[String]).foreach { p =>
@@ -219,11 +220,43 @@ object ParquetReadBenchmark {
         /*
         Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
        Int and String Scan:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-        -------------------------------------------------------------------------------
-        SQL Parquet Reader                       1381 / 1679          7.6         131.7       1.0X
-        SQL Parquet MR                           2005 / 2177          5.2         191.2       0.7X
-        SQL Parquet Vectorized                    919 / 1044         11.4          87.6       1.5X
-        ParquetReader                            1035 / 1163         10.1          98.7       1.3X
+        -------------------------------------------------------------------------------------------
+        SQL Parquet Vectorized                   1025 / 1180         10.2          97.8       1.0X
+        SQL Parquet MR                           2157 / 2222          4.9         205.7       0.5X
+        SQL Parquet Non-vectorized               1450 / 1466          7.2         138.3       0.7X
+        ParquetReader Non-vectorized             1005 / 1022         10.4          95.9       1.0X
+        */
+        benchmark.run()
+      }
+    }
+  }
+
+  def stringDictionaryScanBenchmark(values: Int): Unit = {
+    withTempPath { dir =>
+      withTempTable("t1", "tempTable") {
+        sqlContext.range(values).registerTempTable("t1")
+        sqlContext.sql("select cast((id % 200) + 10000 as STRING) as c1 from t1")
+          .write.parquet(dir.getCanonicalPath)
+        sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable")
+
+        val benchmark = new Benchmark("String Dictionary", values)
+
+        benchmark.addCase("SQL Parquet Vectorized") { iter =>
+          sqlContext.sql("select sum(length(c1)) from tempTable").collect
+        }
+
+        benchmark.addCase("SQL Parquet MR") { iter =>
+          withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+            sqlContext.sql("select sum(length(c1)) from tempTable").collect
+          }
+        }
+
+        /*
+        Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
+        String Dictionary:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+        -------------------------------------------------------------------------------------------
+        SQL Parquet Vectorized                    578 /  593         18.1          55.1       1.0X
+        SQL Parquet MR                           1021 / 1032         10.3          97.4       0.6X
         */
         benchmark.run()
       }
@@ -233,5 +266,6 @@ object ParquetReadBenchmark {
   def main(args: Array[String]): Unit = {
     intScanBenchmark(1024 * 1024 * 15)
     intStringScanBenchmark(1024 * 1024 * 10)
+    stringDictionaryScanBenchmark(1024 * 1024 * 10)
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message