From commits-return-33871-archive-asf-public=cust-asf.ponee.io@spark.apache.org Wed Oct 3 13:14:13 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 4431418065B for ; Wed, 3 Oct 2018 13:14:13 +0200 (CEST) Received: (qmail 20587 invoked by uid 500); 3 Oct 2018 11:14:12 -0000 Mailing-List: contact commits-help@spark.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list commits@spark.apache.org Received: (qmail 20578 invoked by uid 99); 3 Oct 2018 11:14:12 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 03 Oct 2018 11:14:12 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 499A8DFE95; Wed, 3 Oct 2018 11:14:12 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: dongjoon@apache.org To: commits@spark.apache.org Message-Id: <69b1ac046d324d188abd673642fe7aa2@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: spark git commit: [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark Date: Wed, 3 Oct 2018 11:14:12 +0000 (UTC) Repository: spark Updated Branches: refs/heads/master 928d0739c -> 1a5d83bed [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark ## What changes were proposed in this pull request? This PR aims to add `BloomFilterBenchmark`. For the ORC data source, Apache Spark has supported bloom filters for a long time. For the Parquet data source, support is expected to be added with the next Parquet release update. ## How was this patch tested? Manual. ```scala SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark" ``` Closes #22605 from dongjoon-hyun/SPARK-25589. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a5d83be Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a5d83be Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a5d83be Branch: refs/heads/master Commit: 1a5d83bed8a6df62ef643b08453c7dd8feebf93a Parents: 928d073 Author: Dongjoon Hyun Authored: Wed Oct 3 04:14:07 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 04:14:07 2018 -0700 ---------------------------------------------------------------------- .../benchmarks/BloomFilterBenchmark-results.txt | 24 ++++++ .../benchmark/BloomFilterBenchmark.scala | 87 ++++++++++++++++++++ 2 files changed, 111 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/benchmarks/BloomFilterBenchmark-results.txt ---------------------------------------------------------------------- diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt new file mode 100644 index 0000000..2eeb26c --- /dev/null +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -0,0 +1,24 @@ +================================================================================================ +ORC Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Write 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 16765 / 17587 6.0 167.7 1.0X +With bloom filter 20060 / 20626 5.0 200.6 0.8X + + +================================================================================================ +ORC Read 
+================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Read a row from 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 1857 / 1904 53.9 18.6 1.0X +With bloom filter 1399 / 1437 71.5 14.0 1.3X + + http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala new file mode 100644 index 0000000..2f3caca --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.benchmark + +import scala.util.Random + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark to measure write and read performance with Bloom filters. + * + * Currently, only ORC supports bloom filters; a Parquet benchmark will be added as soon as + * support becomes available. + * + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/BloomFilterBenchmark-results.txt". + * }}} + */ +object BloomFilterBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private val scaleFactor = 100 + private val N = scaleFactor * 1000 * 1000 + private val df = spark.range(N).map(_ => Random.nextInt) + + private def writeBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + runBenchmark(s"ORC Write") { + val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + df.write.mode("overwrite").orc(path + "/withoutBF") + } + benchmark.addCase("With bloom filter") { _ => + df.write.mode("overwrite") + .option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + } + benchmark.run() + } + } + } + + private def readBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + df.write.orc(path + "/withoutBF") + df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + + runBenchmark(s"ORC Read") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.orc(path + "/withoutBF").where("value = 0").count + } + benchmark.addCase("With bloom filter") { _ => + spark.read.orc(path + "/withBF").where("value = 0").count + } + benchmark.run() + } + } + } + + override def runBenchmarkSuite(): Unit = { + 
writeBenchmark() + readBenchmark() + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org