parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-1094: Add benchmark for boolean Arrow column I/O
Date Sun, 17 Sep 2017 17:55:46 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master de35f8efb -> 18ca3922e


PARQUET-1094: Add benchmark for boolean Arrow column I/O

Author: Uwe L. Korn <uwe@apache.org>

Closes #391 from xhochy/PARQUET-1094 and squashes the following commits:

089bb3c [Uwe L. Korn] PARQUET-1094: Add benchmark for boolean Arrow column I/O


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/18ca3922
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/18ca3922
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/18ca3922

Branch: refs/heads/master
Commit: 18ca3922e688a3a730d693ff8f2cfbfd65da8c46
Parents: de35f8e
Author: Uwe L. Korn <uwe@apache.org>
Authored: Sun Sep 17 13:55:43 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sun Sep 17 13:55:43 2017 -0400

----------------------------------------------------------------------
 .../arrow/arrow-reader-writer-benchmark.cc      | 49 +++++++++++++++++---
 1 file changed, 43 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18ca3922/src/parquet/arrow/arrow-reader-writer-benchmark.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-benchmark.cc b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
index 84a6fb3..e899e10 100644
--- a/src/parquet/arrow/arrow-reader-writer-benchmark.cc
+++ b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
@@ -27,6 +27,7 @@
 
 #include "arrow/api.h"
 
+using arrow::BooleanBuilder;
 using arrow::NumericBuilder;
 
 #define ABORT_NOT_OK(s)                  \
@@ -66,6 +67,11 @@ struct benchmark_traits<DoubleType> {
   using arrow_type = ::arrow::DoubleType;
 };
 
+template <>
+struct benchmark_traits<BooleanType> {
+  using arrow_type = ::arrow::BooleanType;
+};
+
 template <typename ParquetType>
 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
 
@@ -86,11 +92,11 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
-template <bool nullable, typename ParquetType>
+template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
   ::arrow::TypePtr type = std::make_shared<ArrowType<ParquetType>>();
-  NumericBuilder<ArrowType<ParquetType>> builder(type, ::arrow::default_memory_pool());
+  NumericBuilder<ArrowType<ParquetType>> builder;
   if (nullable) {
     std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
     int n = {0};
@@ -101,7 +107,32 @@ std::shared_ptr<::arrow::Table> TableFromVector(
   }
   std::shared_ptr<::arrow::Array> array;
   ABORT_NOT_OK(builder.Finish(&array));
-  auto field = std::make_shared<::arrow::Field>("column", type, nullable);
+
+  auto field = ::arrow::field("column", type, nullable);
+  auto schema = std::make_shared<::arrow::Schema>(
+      std::vector<std::shared_ptr<::arrow::Field>>({field}));
+  auto column = std::make_shared<::arrow::Column>(field, array);
+  return std::make_shared<::arrow::Table>(
+      schema, std::vector<std::shared_ptr<::arrow::Column>>({column}));
+}
+
+template <>
+std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
+                                                             bool nullable) {
+  BooleanBuilder builder;
+  if (nullable) {
+    std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
+    int n = {0};
+    std::generate(valid_bytes.begin(), valid_bytes.end(),
+                  [&n] { return (n++ % 2) != 0; });
+    ABORT_NOT_OK(builder.Append(vec, valid_bytes));
+  } else {
+    ABORT_NOT_OK(builder.Append(vec));
+  }
+  std::shared_ptr<::arrow::Array> array;
+  ABORT_NOT_OK(builder.Finish(&array));
+
+  auto field = ::arrow::field("column", ::arrow::boolean(), nullable);
   auto schema = std::make_shared<::arrow::Schema>(
       std::vector<std::shared_ptr<::arrow::Field>>({field}));
   auto column = std::make_shared<::arrow::Column>(field, array);
@@ -113,7 +144,7 @@ template <bool nullable, typename ParquetType>
 static void BM_WriteColumn(::benchmark::State& state) {
   format::ColumnChunk thrift_metadata;
   std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
-  std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+  std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
 
   while (state.KeepRunning()) {
     auto output = std::make_shared<InMemoryOutputStream>();
@@ -132,10 +163,13 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int64Type);
 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, DoubleType);
 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
 
+BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
+BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
+
 template <bool nullable, typename ParquetType>
 static void BM_ReadColumn(::benchmark::State& state) {
   std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
-  std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+  std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
   auto output = std::make_shared<InMemoryOutputStream>();
   ABORT_NOT_OK(
       WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
@@ -160,6 +194,9 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
 BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
 BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
 
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+
 }  // namespace benchmark
 
 }  // namespace parquet


Mime
View raw message