arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1290: [C++] Double buffer size when exceeding capacity in arrow::BufferBuilder as in array builders
Date Fri, 28 Jul 2017 15:13:44 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 33c85cd03 -> 4df2a0bfa


ARROW-1290: [C++] Double buffer size when exceeding capacity in arrow::BufferBuilder as in array builders

Kind of an embarrassing oversight, but it's good that we caught it.

In a test for incrementally building a BinaryArray, this yields about a 4x speedup.

after:

```
Benchmark                                     Time           CPU Iterations
---------------------------------------------------------------------------
BM_BuildBinaryArray/repeats:3             11892 us      11892 us         59   840.886MB/s
BM_BuildBinaryArray/repeats:3             11903 us      11904 us         59   840.082MB/s
BM_BuildBinaryArray/repeats:3             11909 us      11910 us         59   839.662MB/s
BM_BuildBinaryArray/repeats:3_mean        11902 us      11902 us         59    840.21MB/s
BM_BuildBinaryArray/repeats:3_stddev          7 us          7 us          0   520.137kB/s
```

before:

```
Benchmark                                     Time           CPU Iterations
---------------------------------------------------------------------------
BM_BuildBinaryArray/repeats:3             45678 us      45571 us         15   219.439MB/s
BM_BuildBinaryArray/repeats:3             45416 us      45209 us         15   221.197MB/s
BM_BuildBinaryArray/repeats:3             45227 us      45122 us         15   221.619MB/s
BM_BuildBinaryArray/repeats:3_mean        45440 us      45301 us         15   220.752MB/s
BM_BuildBinaryArray/repeats:3_stddev        185 us        194 us          0   966.716kB/s
```

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #905 from wesm/ARROW-1290 and squashes the following commits:

59d4d9cd [Wes McKinney] Double buffer size when exceeding capacity in arrow::BufferBuilder, like in other array builder classes


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/4df2a0bf
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/4df2a0bf
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/4df2a0bf

Branch: refs/heads/master
Commit: 4df2a0bfa15ac28f794a18543801a0e79194617b
Parents: 33c85cd
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Fri Jul 28 11:13:39 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Fri Jul 28 11:13:39 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/buffer.h             |  7 +++++--
 cpp/src/arrow/builder-benchmark.cc | 18 ++++++++++++++++++
 cpp/src/arrow/python/config.cc     |  1 +
 cpp/src/plasma/store.cc            |  5 +++--
 4 files changed, 27 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/4df2a0bf/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 09e539d..5d050b7 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -25,6 +25,7 @@
 #include <string>
 
 #include "arrow/status.h"
+#include "arrow/util/bit-util.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
@@ -204,7 +205,8 @@ class ARROW_EXPORT BufferBuilder {
 
   Status Append(const uint8_t* data, int64_t length) {
     if (capacity_ < length + size_) {
-      RETURN_NOT_OK(Resize(length + size_));
+      int64_t new_capacity = BitUtil::NextPower2(length + size_);
+      RETURN_NOT_OK(Resize(new_capacity));
     }
     UnsafeAppend(data, length);
     return Status::OK();
@@ -213,7 +215,8 @@ class ARROW_EXPORT BufferBuilder {
   // Advance pointer and zero out memory
   Status Advance(int64_t length) {
     if (capacity_ < length + size_) {
-      RETURN_NOT_OK(Resize(length + size_));
+      int64_t new_capacity = BitUtil::NextPower2(length + size_);
+      RETURN_NOT_OK(Resize(new_capacity));
     }
     memset(data_ + size_, 0, static_cast<size_t>(length));
     size_ += length;

http://git-wip-us.apache.org/repos/asf/arrow/blob/4df2a0bf/cpp/src/arrow/builder-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index 8ba9360..13d7b20 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -156,6 +156,22 @@ static void BM_BuildStringDictionary(
                           sizeof(int32_t));
 }
 
+static void BM_BuildBinaryArray(benchmark::State& state) {  // NOLINT non-const reference
+  const int64_t iterations = 1 << 20;
+
+  std::string value = "1234567890";
+  while (state.KeepRunning()) {
+    BinaryBuilder builder(default_memory_pool());
+    for (int64_t i = 0; i < iterations; i++) {
+      ABORT_NOT_OK(builder.Append(value));
+    }
+    std::shared_ptr<Array> out;
+    ABORT_NOT_OK(builder.Finish(&out));
+  }
+  // Assuming a string here needs on average 2 bytes
+  state.SetBytesProcessed(state.iterations() * iterations * value.size());
+}
+
 BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildVectorNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildAdaptiveIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
@@ -166,4 +182,6 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrose
 BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 
+BENCHMARK(BM_BuildBinaryArray)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/4df2a0bf/cpp/src/arrow/python/config.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/config.cc b/cpp/src/arrow/python/config.cc
index 92ca9db..bda7a7a 100644
--- a/cpp/src/arrow/python/config.cc
+++ b/cpp/src/arrow/python/config.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include "arrow/python/platform.h"
+
 #include "arrow/python/config.h"
 
 namespace arrow {

http://git-wip-us.apache.org/repos/asf/arrow/blob/4df2a0bf/cpp/src/plasma/store.cc
----------------------------------------------------------------------
diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc
index 34adc62..a9425b6 100644
--- a/cpp/src/plasma/store.cc
+++ b/cpp/src/plasma/store.cc
@@ -690,8 +690,9 @@ int main(int argc, char* argv[]) {
   close(shm_fd);
   if (system_memory > shm_mem_avail) {
     ARROW_LOG(FATAL) << "System memory request exceeds memory available in /dev/shm. The "
-                        "request is for " << system_memory
-                     << " bytes, and the amount available is " << shm_mem_avail
+                        "request is for "
+                     << system_memory << " bytes, and the amount available is "
+                     << shm_mem_avail
                      << " bytes. You may be able to free up space by deleting files in "
                         "/dev/shm. If you are inside a Docker container, you may need to "
                         "pass "


Mime
View raw message