arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] 07/09: ARROW-1844: [C++] Add initial Unique benchmarks for int64, variable-length strings
Date Fri, 01 Dec 2017 16:50:47 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit bbbbbfb1ecd29a4d435f942cec5f01226368c71c
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Wed Nov 29 10:43:20 2017 -0500

    ARROW-1844: [C++] Add initial Unique benchmarks for int64, variable-length strings
    
    I also fixed a bug that this surfaced in the hash table resize (unit test coverage was
    not adequate)
    
    Now we have
    
    ```
    $ ./release/compute-benchmark
    Run on (8 X 4200.16 MHz CPU s)
    2017-11-28 18:33:53
    Benchmark                                                           Time           CPU
Iterations
    -------------------------------------------------------------------------------------------------
    BM_BuildDictionary/min_time:1.000                                1352 us       1352 us
      1038   2.88639GB/s
    BM_BuildStringDictionary/min_time:1.000                          3994 us       3994 us
       351   75.5809MB/s
    BM_UniqueInt64NoNulls/16M/50/min_time:1.000/real_time           35814 us      35816 us
        39   3.49023GB/s
    BM_UniqueInt64NoNulls/16M/1024/min_time:1.000/real_time        119656 us     119660 us
        12   1069.73MB/s
    BM_UniqueInt64NoNulls/16M/10k/min_time:1.000/real_time         174924 us     174930 us
         8   731.747MB/s
    BM_UniqueInt64NoNulls/16M/1024k/min_time:1.000/real_time       448425 us     448440 us
         3   285.443MB/s
    BM_UniqueInt64WithNulls/16M/50/min_time:1.000/real_time         49511 us      49513 us
        29   2.52468GB/s
    BM_UniqueInt64WithNulls/16M/1024/min_time:1.000/real_time      134519 us     134523 us
        10   951.541MB/s
    BM_UniqueInt64WithNulls/16M/10k/min_time:1.000/real_time       191331 us     191336 us
         7   668.999MB/s
    BM_UniqueInt64WithNulls/16M/1024k/min_time:1.000/real_time     533597 us     533613 us
         3   239.882MB/s
    BM_UniqueString10bytes/16M/50/min_time:1.000/real_time         150731 us     150736 us
         9    1061.5MB/s
    BM_UniqueString10bytes/16M/1024/min_time:1.000/real_time       256929 us     256938 us
         5   622.739MB/s
    BM_UniqueString10bytes/16M/10k/min_time:1.000/real_time        414412 us     414426 us
         3    386.09MB/s
    BM_UniqueString10bytes/16M/1024k/min_time:1.000/real_time     1744253 us    1744308 us
         1   91.7298MB/s
    BM_UniqueString100bytes/16M/50/min_time:1.000/real_time        563890 us     563909 us
         2   2.77093GB/s
    BM_UniqueString100bytes/16M/1024/min_time:1.000/real_time      704695 us     704720 us
         2   2.21727GB/s
    BM_UniqueString100bytes/16M/10k/min_time:1.000/real_time       995685 us     995721 us
         2   1.56927GB/s
    BM_UniqueString100bytes/16M/1024k/min_time:1.000/real_time    3584108 us    3584230 us
         1   446.415MB/s
    ```
    
    We can also refactor the hash table implementations without worrying too much about whether
we're making things slower
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1370 from wesm/ARROW-1844 and squashes the following commits:
    
    638f1a11 [Wes McKinney] Decrease resize load factor to 0.5
    2885c645 [Wes McKinney] Multiply bytes processed by state.iterations()
    f7b36194 [Wes McKinney] Add initial Unique benchmarks for int64, strings
---
 cpp/src/arrow/compute/compute-benchmark.cc | 127 ++++++++++++++++++++++++++++-
 cpp/src/arrow/compute/compute-test.cc      |   4 +-
 cpp/src/arrow/compute/kernels/hash.cc      |   4 +-
 3 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/compute-benchmark.cc b/cpp/src/arrow/compute/compute-benchmark.cc
index 974fffc..aa7d899 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
   state.SetBytesProcessed(state.iterations() * total_bytes);
 }
 
-BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+template <typename Type>
+struct HashParams {
+  using T = typename Type::c_type;
+
+  double null_percent;
+
+  void GenerateTestData(const int64_t length, const int64_t num_unique,
+                        std::shared_ptr<Array>* arr) const {
+    std::vector<int64_t> draws;
+    std::vector<T> values;
+    std::vector<bool> is_valid;
+    test::randint<int64_t>(length, 0, num_unique, &draws);
+    for (int64_t draw : draws) {
+      values.push_back(draw);
+    }
+
+    if (this->null_percent > 0) {
+      test::random_is_valid(length, this->null_percent, &is_valid);
+      ArrayFromVector<Type, T>(is_valid, values, arr);
+    } else {
+      ArrayFromVector<Type, T>(values, arr);
+    }
+  }
+
+  int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T); }
+};
+
+template <>
+struct HashParams<StringType> {
+  double null_percent;
+  int32_t byte_width;
+  void GenerateTestData(const int64_t length, const int64_t num_unique,
+                        std::shared_ptr<Array>* arr) const {
+    std::vector<int64_t> draws;
+    test::randint<int64_t>(length, 0, num_unique, &draws);
+
+    const int64_t total_bytes = this->byte_width * num_unique;
+    std::vector<uint8_t> uniques(total_bytes);
+    const uint32_t seed = 0;
+    test::random_bytes(total_bytes, seed, uniques.data());
+
+    std::vector<bool> is_valid;
+    if (this->null_percent > 0) {
+      test::random_is_valid(length, this->null_percent, &is_valid);
+    }
+
+    StringBuilder builder;
+    for (int64_t i = 0; i < length; ++i) {
+      if (this->null_percent == 0 || is_valid[i]) {
+        ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i],
+                                    this->byte_width));
+      } else {
+        ABORT_NOT_OK(builder.AppendNull());
+      }
+    }
+    ABORT_NOT_OK(builder.Finish(arr));
+  }
+
+  int64_t GetBytesProcessed(int64_t length) const { return length * byte_width; }
+};
+
+template <typename ParamType>
+void BenchUnique(benchmark::State& state, const ParamType& params, int64_t length,
+                 int64_t num_unique) {
+  std::shared_ptr<Array> arr;
+  params.GenerateTestData(length, num_unique, &arr);
+
+  FunctionContext ctx;
+  while (state.KeepRunning()) {
+    std::shared_ptr<Array> out;
+    ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out));
+  }
+  state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
+}
+
+template <typename ParamType>
+void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
+                           int64_t length, int64_t num_unique) {
+  std::shared_ptr<Array> arr;
+  params.GenerateTestData(length, num_unique, &arr);
+
+  FunctionContext ctx;
+  while (state.KeepRunning()) {
+    Datum out;
+    ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out));
+  }
+  state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
+}
+
+static void BM_UniqueInt64NoNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueInt64WithNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<Int64Type>{0.05}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueString10bytes(benchmark::State& state) {
+  // Byte strings with 10 bytes each
+  BenchUnique(state, HashParams<StringType>{0.05, 10}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueString100bytes(benchmark::State& state) {
+  // Byte strings with 100 bytes each
+  BenchUnique(state, HashParams<StringType>{0.05, 100}, state.range(0), state.range(1));
+}
+
+BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+
+constexpr int64_t kHashBenchmarkLength = 1 << 24;
+
+#define ADD_HASH_ARGS(WHAT)                        \
+  WHAT->Args({kHashBenchmarkLength, 50})           \
+      ->Args({kHashBenchmarkLength, 1 << 10})      \
+      ->Args({kHashBenchmarkLength, 10 * 1 << 10}) \
+      ->Args({kHashBenchmarkLength, 1 << 20})      \
+      ->MinTime(1.0)                               \
+      ->Unit(benchmark::kMicrosecond)              \
+      ->UseRealTime()
+
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index c73bfa3..84af8f7 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -869,8 +869,8 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
     return;
   }
 
-  const int64_t kTotalValues = 10000;
-  const int64_t kRepeats = 10;
+  const int64_t kTotalValues = 1000000;
+  const int64_t kRepeats = 5;
 
   vector<T> values;
   vector<T> uniques;
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 66c9073..750f1d3 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -43,7 +43,7 @@ typedef int32_t hash_slot_t;
 static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
 
 // The maximum load factor for the hash table before resizing.
-static constexpr double kMaxHashTableLoad = 0.7;
+static constexpr double kMaxHashTableLoad = 0.5;
 
 enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
 
@@ -260,7 +260,7 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
       COMPUTE_HASH;                                                              \
       while (kHashSlotEmpty != new_hash_slots[j]) {                              \
         ++j;                                                                     \
-        if (ARROW_PREDICT_FALSE(j == hash_table_size_)) {                        \
+        if (ARROW_PREDICT_FALSE(j == new_size)) {                                \
           j = 0;                                                                 \
         }                                                                        \
       }                                                                          \

-- 
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <commits@arrow.apache.org>.

Mime
View raw message