spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dongjoon-hyun <...@git.apache.org>
Subject [GitHub] spark pull request #20265: [SPARK-21783][SQL] Turn on ORC filter push-down b...
Date Wed, 17 Jan 2018 07:34:49 GMT
Github user dongjoon-hyun commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20265#discussion_r161973650
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala ---
    @@ -0,0 +1,230 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql
    +
    +import java.io.File
    +
    +import scala.util.{Random, Try}
    +
    +import org.apache.spark.SparkConf
    +import org.apache.spark.sql.functions.monotonically_increasing_id
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.util.{Benchmark, Utils}
    +
    +
    +/**
    + * Benchmark to measure read performance with Filter pushdown.
    + */
    +object FilterPushdownBenchmark {
    +  val conf = new SparkConf()
    +  conf.set("orc.compression", "snappy")
    +  conf.set("spark.sql.parquet.compression.codec", "snappy")
    +
    +  private val spark = SparkSession.builder()
    +    .master("local[1]")
    +    .appName("FilterPushdownBenchmark")
    +    .config(conf)
    +    .getOrCreate()
    +
    +  def withTempPath(f: File => Unit): Unit = {
    +    val path = Utils.createTempDir()
    +    path.delete()
    +    try f(path) finally Utils.deleteRecursively(path)
    +  }
    +
    +  def withTempTable(tableNames: String*)(f: => Unit): Unit = {
    +    try f finally tableNames.foreach(spark.catalog.dropTempView)
    +  }
    +
    +  def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
    +    val (keys, values) = pairs.unzip
    +    val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption)
    +    (keys, values).zipped.foreach(spark.conf.set)
    +    try f finally {
    +      keys.zip(currentValues).foreach {
    +        case (key, Some(value)) => spark.conf.set(key, value)
    +        case (key, None) => spark.conf.unset(key)
    +      }
    +    }
    +  }
    +
    +  private def prepareTable(dir: File, numRows: Int, width: Int): Unit = {
    +    import spark.implicits._
    +    val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i")
    +    val df = spark.range(numRows).map(_ => Random.nextLong).selectExpr(selectExpr: _*)
    +      .withColumn("id", monotonically_increasing_id())
    +
    +    val dirORC = dir.getCanonicalPath + "/orc"
    +    val dirParquet = dir.getCanonicalPath + "/parquet"
    +
    +    df.write.mode("overwrite").orc(dirORC)
    +    df.write.mode("overwrite").parquet(dirParquet)
    +
    +    spark.read.orc(dirORC).createOrReplaceTempView("orcTable")
    +    spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable")
    +  }
    +
    +  def filterPushDownBenchmark(values: Int, title: String, expr: String): Unit = {
    +    val benchmark = new Benchmark(title, values, minNumIters = 5)
    +
    +    Seq(false, true).foreach { pushDownEnabled =>
    +      val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}"
    +      benchmark.addCase(name) { _ =>
    +        withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") {
    +          spark.sql(s"SELECT * FROM parquetTable WHERE $expr").collect()
    +        }
    +      }
    +    }
    +
    +    Seq(false, true).foreach { pushDownEnabled =>
    +      val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}"
    +      benchmark.addCase(name) { _ =>
    +        withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") {
    +          spark.sql(s"SELECT * FROM orcTable WHERE $expr").collect()
    +        }
    +      }
    +    }
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2
    +    Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
    +
    +    Select 0 row (id IS NULL):              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2091 / 2258          0.5        1993.9       1.0X
    +    Parquet Vectorized (Pushdown)                   41 /   44         25.6          39.0      51.1X
    +    Native ORC Vectorized                         1625 / 1648          0.6        1549.6       1.3X
    +    Native ORC Vectorized (Pushdown)                45 /   47         23.5          42.5      46.9X
    +
    +    Select 0 row (524288 < id < 524288):    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2202 / 2294          0.5        2099.7       1.0X
    +    Parquet Vectorized (Pushdown)                  734 /  844          1.4         699.9       3.0X
    +    Native ORC Vectorized                         1632 / 1659          0.6        1556.0       1.3X
    +    Native ORC Vectorized (Pushdown)                94 /   98         11.2          89.6      23.4X
    +
    +    Select 1 row (id = 524288):             Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2113 / 2160          0.5        2015.3       1.0X
    +    Parquet Vectorized (Pushdown)                  711 /  790          1.5         677.7       3.0X
    +    Native ORC Vectorized                         1612 / 1657          0.7        1537.2       1.3X
    +    Native ORC Vectorized (Pushdown)                92 /   95         11.4          87.7      23.0X
    +
    +    Select 1 row (id <=> 524288):           Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2105 / 2149          0.5        2007.9       1.0X
    +    Parquet Vectorized (Pushdown)                  712 /  794          1.5         679.2       3.0X
    +    Native ORC Vectorized                         1619 / 1655          0.6        1543.7       1.3X
    +    Native ORC Vectorized (Pushdown)                90 /   93         11.6          85.9      23.4X
    +
    +    Select 1 row (524288 <= id <= 524288):  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2081 / 2120          0.5        1984.8       1.0X
    +    Parquet Vectorized (Pushdown)                  700 /  793          1.5         667.5       3.0X
    +    Native ORC Vectorized                         1618 / 1653          0.6        1542.7       1.3X
    +    Native ORC Vectorized (Pushdown)                91 /   94         11.5          86.6      22.9X
    +
    +    Select 1 row (524287 < id < 524289):    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2094 / 2127          0.5        1997.3       1.0X
    +    Parquet Vectorized (Pushdown)                  714 /  792          1.5         680.8       2.9X
    +    Native ORC Vectorized                         1621 / 1644          0.6        1546.3       1.3X
    +    Native ORC Vectorized (Pushdown)                90 /   94         11.6          86.1      23.2X
    +
    +    Select 10% rows (id < 104857):          Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            2498 / 2591          0.4        2381.9       1.0X
    +    Parquet Vectorized (Pushdown)                 1047 / 1082          1.0         998.2       2.4X
    +    Native ORC Vectorized                         1986 / 2119          0.5        1893.8       1.3X
    +    Native ORC Vectorized (Pushdown)               552 /  582          1.9         526.1       4.5X
    +
    +    Select 50% rows (id < 524288):          Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            4321 / 5021          0.2        4121.3       1.0X
    +    Parquet Vectorized (Pushdown)                 3967 / 4183          0.3        3783.6       1.1X
    +    Native ORC Vectorized                         4107 / 4565          0.3        3916.9       1.1X
    +    Native ORC Vectorized (Pushdown)              2983 / 3861          0.4        2844.5       1.4X
    +
    +    Select 90% rows (id < 943718):          Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            6815 / 7287          0.2        6499.0       1.0X
    +    Parquet Vectorized (Pushdown)                 6891 / 7220          0.2        6571.5       1.0X
    +    Native ORC Vectorized                         7337 / 7565          0.1        6997.1       0.9X
    +    Native ORC Vectorized (Pushdown)              7274 / 7523          0.1        6936.6       0.9X
    +
    +    Select all rows (id IS NOT NULL):       Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            7321 / 7380          0.1        6981.5       1.0X
    +    Parquet Vectorized (Pushdown)                 7352 / 7398          0.1        7011.2       1.0X
    +    Native ORC Vectorized                         7386 / 7660          0.1        7043.9       1.0X
    +    Native ORC Vectorized (Pushdown)              7629 / 7705          0.1        7275.9       1.0X
    +
    +    Select all rows (id > -1):              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            7125 / 7384          0.1        6795.2       1.0X
    +    Parquet Vectorized (Pushdown)                 7334 / 7390          0.1        6994.3       1.0X
    +    Native ORC Vectorized                         7517 / 7642          0.1        7168.7       0.9X
    +    Native ORC Vectorized (Pushdown)              7323 / 7601          0.1        6983.7       1.0X
    +
    +    Select all rows (id != -1):             Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
    +    -----------------------------------------------------------------------------------------------
    +    Parquet Vectorized                            7281 / 7850          0.1        6944.0       1.0X
    +    Parquet Vectorized (Pushdown)                 7311 / 7939          0.1        6972.7       1.0X
    +    Native ORC Vectorized                         7530 / 7748          0.1        7181.4       1.0X
    +    Native ORC Vectorized (Pushdown)              7309 / 7667          0.1        6970.2       1.0X
    +    */
    +    benchmark.run()
    +  }
    +
    +  def main(args: Array[String]): Unit = {
    +    val numRows = 1024 * 1024
    +    val width = 20
    --- End diff --
    
    No problem. I'll set it to 5.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message