This is an automated email from the ASF dual-hosted git repository.
abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit dff0349f9f9c6dde759bcd97e2f76957d8043df7
Author: Todd Lipcon <todd@apache.org>
AuthorDate: Fri Dec 6 15:12:59 2019 -0800
client: optimize destruction of WriteRpc
When writing batches with lots of operations, the WriteRpc destructor
ends up cache-miss bound, since the various InFlightOps and WriteOps are
strewn all about memory. This adds some prefetching, which sped things up
noticeably (~37%) in a benchmark that ends up bound by the reactor thread on
the client side.
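The idea, in isolation: while iterating, issue a prefetch for an element a
fixed distance ahead before doing the work on the current one. A minimal
standalone sketch of that pattern (illustrative only; the Item type and
DeleteAllWithPrefetch below are hypothetical stand-ins, not the actual
InFlightOp/WriteOp classes -- the real change is in the diff below):

  #include <vector>

  struct Item {
    int payload;
  };

  // Delete every element of 'items' while prefetching the element we'll
  // touch next, so the cache miss overlaps with the current delete.
  void DeleteAllWithPrefetch(std::vector<Item*>* items) {
    constexpr int kPrefetchDistance = 1;
    const int size = static_cast<int>(items->size());
    for (int i = 0; i < size; i++) {
      int ahead = i + kPrefetchDistance;
      if (ahead < size) {
        // 0 = read, 3 = high temporal locality (GCC/Clang builtin).
        __builtin_prefetch((*items)[ahead], 0, 3);
      }
      delete (*items)[i];
    }
    items->clear();
  }

The actual patch applies the same idea across three pointer-chasing levels
(the InFlightOp, the WriteOp's embedded KuduPartialRow, and the isset
bitmap it points to), staggering the prefetch distance per level.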
$ perf stat ./build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8
Before:
Generator report
time total : 51403.6 ms
time per row: 0.000642545 ms
Dropping auto-created table 'default.loadgen_auto_d289807fc12a4b1c861f79b19af9ec8e'
Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':

180,585.24 msec task-clock # 3.508 CPUs utilized
25,373 context-switches # 0.141 K/sec
1,648 cpu-migrations # 0.009 K/sec
50,927 page-faults # 0.282 K/sec
726,022,544,856 cycles # 4.020 GHz (83.33%)
71,782,315,500 stalled-cycles-frontend # 9.89% frontend cycles idle (83.36%)
412,273,652,207 stalled-cycles-backend # 56.79% backend cycles idle (83.29%)
408,271,477,858 instructions # 0.56 insn per cycle
                             # 1.01 stalled cycles per insn (83.35%)
75,750,045,948 branches # 419.470 M/sec (83.33%)
296,247,270 branch-misses # 0.39% of all branches (83.34%)
51.475433628 seconds time elapsed
178.590913000 seconds user
1.935099000 seconds sys
After:
Generator report
time total : 37293.8 ms
time per row: 0.000466172 ms
Dropping auto-created table 'default.loadgen_auto_ece2f41beef94a9fa032c77899f7e61c'
Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':

189,125.49 msec task-clock # 5.060 CPUs utilized
29,363 context-switches # 0.155 K/sec
2,043 cpu-migrations # 0.011 K/sec
48,405 page-faults # 0.256 K/sec
772,496,448,279 cycles # 4.085 GHz (83.33%)
129,999,474,226 stalled-cycles-frontend # 16.83% frontend cycles idle (83.36%)
300,049,388,250 stalled-cycles-backend # 38.84% backend cycles idle (83.30%)
414,415,517,571 instructions # 0.54 insn per cycle
                             # 0.72 stalled cycles per insn (83.32%)
76,829,647,882 branches # 406.236 M/sec (83.34%)
352,749,453 branch-misses # 0.46% of all branches (83.35%)
37.376785122 seconds time elapsed
186.834651000 seconds user
2.143945000 seconds sys
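From the two reports above: total time drops from 51403.6 ms to 37293.8 ms,
and 51403.6 / 37293.8 ≈ 1.38, i.e. roughly the ~37% throughput improvement
quoted above (about a 27% reduction in wall-clock time).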
Change-Id: I538f995f7ec161e746885c6b31cd1dccd72139b0
Reviewed-on: http://gerrit.cloudera.org:8080/14868
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Todd Lipcon <todd@apache.org>
---
src/kudu/client/batcher.cc | 79 +++++++++++++++++++++++++++++++++++++++++--
src/kudu/common/partial_row.h | 4 +++
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc
index e6ce9dd..c68e77d 100644
--- a/src/kudu/client/batcher.cc
+++ b/src/kudu/client/batcher.cc
@@ -41,6 +41,7 @@
#include "kudu/client/write_op-internal.h"
#include "kudu/client/write_op.h"
#include "kudu/common/common.pb.h"
+#include "kudu/common/partial_row.h"
#include "kudu/common/partition.h"
#include "kudu/common/row_operations.h"
#include "kudu/common/wire_protocol.h"
@@ -50,7 +51,7 @@
#include "kudu/gutil/bind_helpers.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/map-util.h"
-#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/rpc/connection.h"
#include "kudu/rpc/request_tracker.h"
@@ -76,7 +77,6 @@ using strings::Substitute;
namespace kudu {
-class KuduPartialRow;
class Schema;
namespace rpc {
@@ -342,7 +342,80 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
}
WriteRpc::~WriteRpc() {
-  STLDeleteElements(&ops_);
+  // Since the WriteRpc is destructed a while after all of the
+  // InFlightOps and other associated objects were last touched,
+  // and because those operations were not all allocated together,
+  // they're likely to be strewn all around in RAM. This function
+  // then ends up cache-miss-bound.
+  //
+  // Ideally, we could change the allocation pattern to make them
+  // more contiguous, but it's a bit tricky -- this is client code,
+  // so we don't really have great control over how the write ops
+  // themselves are allocated.
+  //
+  // So, instead, we do some prefetching. The pointer graph looks like:
+  //
+  //   vector<InFlightOp*>
+  //     [i] InFlightOp* pointer
+  //          \----> InFlightOp instance
+  //                   | WriteOp* pointer
+  //                   |   \-----> WriteOp instance
+  //                   |             KuduPartialRow (embedded)
+  //                   |               | isset_bitmap_
+  //                   |               \-----> heap allocated memory
+  //
+  //
+  // So, we need to do three "layers" of prefetch. First, prefetch the
+  // InFlightOp instance. Then, prefetch the KuduPartialRow contained by
+  // the WriteOp that it points to. Then, prefetch the isset bitmap that
+  // the PartialRow points to.
+  //
+  // In order to get parallelism here, we need to stagger the prefetches:
+  // the "root" of the tree needs to look farthest in the future, then
+  // prefetch the next level, then prefetch the closest level, before
+  // eventually calling the destructor.
+  //
+  // Experimentally, it seems we get enough benefit from only prefetching
+  // one entry "ahead" in between each.
+  constexpr static int kPrefetchDistance = 1;
+  const int size = ops_.size();
+
+  auto iter = [this, size](int i) {
+    int ifo_prefetch = i + kPrefetchDistance * 3;
+    int op_prefetch = i + kPrefetchDistance * 2;
+    int row_prefetch = i + kPrefetchDistance;
+    if (ifo_prefetch >= 0 && ifo_prefetch < size) {
+      __builtin_prefetch(ops_[ifo_prefetch], 0, PREFETCH_HINT_T0);
+    }
+    if (op_prefetch >= 0 && op_prefetch < size) {
+      const auto* op = ops_[op_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(&op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (row_prefetch >= 0 && row_prefetch < size) {
+      const auto* op = ops_[row_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (i >= 0) {
+      delete ops_[i];
+    }
+  };
+
+  // Explicitly perform "loop splitting" to avoid the branches in the main
+  // body of the loop.
+  int i = -kPrefetchDistance * 3;
+  while (i < 0) {
+    iter(i++);
+  }
+  while (i < size - kPrefetchDistance * 3) {
+    iter(i++);
+  }
+  while (i < size) {
+    iter(i++);
+  }
}
string WriteRpc::ToString() const {
diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h
index 3a50907..d2fc631 100644
--- a/src/kudu/common/partial_row.h
+++ b/src/kudu/common/partial_row.h
@@ -44,6 +44,9 @@ class ColumnSchema;
namespace client {
class ClientTest_TestProjectionPredicatesFuzz_Test;
class KuduWriteOperation;
+namespace internal {
+class WriteRpc;
+} // namespace internal
template<typename KeyTypeWrapper> struct SliceKeysTestSetup;// IWYU pragma: keep
template<typename KeyTypeWrapper> struct IntKeysTestSetup; // IWYU pragma: keep
} // namespace client
@@ -586,6 +589,7 @@ class KUDU_EXPORT KuduPartialRow {
private:
friend class client::KuduWriteOperation; // for row_data_.
+ friend class client::internal::WriteRpc; // for row_data_.
friend class KeyUtilTest;
friend class PartitionSchema;
friend class RowOperationsPBDecoder;
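The "loop splitting" in the new destructor is a general trick worth calling
out: by peeling the warm-up and tail iterations into their own loops, the hot
middle loop carries no range checks once the helper is inlined. A minimal
standalone sketch of the same shape (illustrative only; ProcessAll and the
Item type below are hypothetical, not part of the patch):

  #include <vector>

  struct Item {
    int payload;
  };

  // Process items[i] while prefetching items[i + kAhead]. The body takes a
  // signed index so the warm-up region (negative i) issues only prefetches.
  void ProcessAll(const std::vector<Item*>& items) {
    constexpr int kAhead = 1;
    const int size = static_cast<int>(items.size());

    auto body = [&](int i) {
      int ahead = i + kAhead;
      if (ahead >= 0 && ahead < size) {
        __builtin_prefetch(items[ahead], 0, 3);
      }
      if (i >= 0) {
        items[i]->payload++;  // stand-in for the real per-item work
      }
    };

    int i = -kAhead;
    // Warm-up: only prefetches are issued here.
    while (i < 0) {
      body(i++);
    }
    // Hot loop: here 0 <= i and i + kAhead < size always hold, so after
    // inlining the compiler can drop both range checks from 'body'.
    while (i < size - kAhead) {
      body(i++);
    }
    // Tail: the last kAhead iterations, where the prefetch bound matters.
    while (i < size) {
      body(i++);
    }
  }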