From commits-return-8259-archive-asf-public=cust-asf.ponee.io@kudu.apache.org Mon Dec 9 22:34:09 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id D9C0818067E for ; Mon, 9 Dec 2019 23:34:08 +0100 (CET) Received: (qmail 43311 invoked by uid 500); 9 Dec 2019 22:34:08 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 43235 invoked by uid 99); 9 Dec 2019 22:34:07 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 09 Dec 2019 22:34:07 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id C74ED8D80D; Mon, 9 Dec 2019 22:34:07 +0000 (UTC) Date: Mon, 09 Dec 2019 22:34:09 +0000 To: "commits@kudu.apache.org" Subject: [kudu] 02/03: client: optimize destruction of WriteRpc MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit From: abukor@apache.org In-Reply-To: <157593084770.574.3719990368659625480@gitbox.apache.org> References: <157593084770.574.3719990368659625480@gitbox.apache.org> X-Git-Host: gitbox.apache.org X-Git-Repo: kudu X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Rev: dff0349f9f9c6dde759bcd97e2f76957d8043df7 X-Git-NotificationType: diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated Message-Id: <20191209223407.C74ED8D80D@gitbox.apache.org> This is an automated email from the ASF dual-hosted git repository. abukor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git commit dff0349f9f9c6dde759bcd97e2f76957d8043df7 Author: Todd Lipcon AuthorDate: Fri Dec 6 15:12:59 2019 -0800 client: optimize destruction of WriteRpc When writing batches with lots of operations, the WriteRpc destructor ends up cache-miss bound, since the various InFlightOp and WriteOps are strewn all about memory. This adds some prefetching which sped things up noticeably (~37%) in a benchmark which ends up bound by the reactor thread on the client side. $ perf stat ./build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8 Before: Generator report time total : 51403.6 ms time per row: 0.000642545 ms Dropping auto-created table 'default.loadgen_auto_d289807fc12a4b1c861f79b19af9ec8e' Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8': 180,585.24 msec task-clock # 3.508 CPUs utilized 25,373 context-switches # 0.141 K/sec 1,648 cpu-migrations # 0.009 K/sec 50,927 page-faults # 0.282 K/sec 726,022,544,856 cycles # 4.020 GHz (83.33%) 71,782,315,500 stalled-cycles-frontend # 9.89% frontend cycles idle (83.36%) 412,273,652,207 stalled-cycles-backend # 56.79% backend cycles idle (83.29%) 408,271,477,858 instructions # 0.56 insn per cycle # 1.01 stalled cycles per insn (83.35%) 75,750,045,948 branches # 419.470 M/sec (83.33%) 296,247,270 branch-misses # 0.39% of all branches (83.34%) 51.475433628 seconds time elapsed 178.590913000 seconds user 1.935099000 seconds sys After: Generator report time total : 37293.8 ms time per row: 0.000466172 ms Dropping auto-created table 'default.loadgen_auto_ece2f41beef94a9fa032c77899f7e61c' Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8': 189,125.49 msec task-clock # 5.060 CPUs utilized 29,363 context-switches # 0.155 K/sec 2,043 cpu-migrations # 0.011 K/sec 48,405 page-faults # 0.256 K/sec 772,496,448,279 cycles # 4.085 GHz (83.33%) 129,999,474,226 stalled-cycles-frontend # 16.83% frontend cycles idle (83.36%) 300,049,388,250 stalled-cycles-backend # 38.84% backend cycles idle (83.30%) 414,415,517,571 instructions # 0.54 insn per cycle # 0.72 stalled cycles per insn (83.32%) 76,829,647,882 branches # 406.236 M/sec (83.34%) 352,749,453 branch-misses # 0.46% of all branches (83.35%) 37.376785122 seconds time elapsed 186.834651000 seconds user 2.143945000 seconds sys Change-Id: I538f995f7ec161e746885c6b31cd1dccd72139b0 Reviewed-on: http://gerrit.cloudera.org:8080/14868 Reviewed-by: Adar Dembo Tested-by: Todd Lipcon --- src/kudu/client/batcher.cc | 79 +++++++++++++++++++++++++++++++++++++++++-- src/kudu/common/partial_row.h | 4 +++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc index e6ce9dd..c68e77d 100644 --- a/src/kudu/client/batcher.cc +++ b/src/kudu/client/batcher.cc @@ -41,6 +41,7 @@ #include "kudu/client/write_op-internal.h" #include "kudu/client/write_op.h" #include "kudu/common/common.pb.h" +#include "kudu/common/partial_row.h" #include "kudu/common/partition.h" #include "kudu/common/row_operations.h" #include "kudu/common/wire_protocol.h" @@ -50,7 +51,7 @@ #include "kudu/gutil/bind_helpers.h" #include "kudu/gutil/gscoped_ptr.h" #include "kudu/gutil/map-util.h" -#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/port.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/rpc/connection.h" #include "kudu/rpc/request_tracker.h" @@ -76,7 +77,6 @@ using strings::Substitute; namespace kudu { -class KuduPartialRow; class Schema; namespace rpc { @@ -342,7 +342,80 @@ WriteRpc::WriteRpc(const scoped_refptr& batcher, } WriteRpc::~WriteRpc() { - STLDeleteElements(&ops_); + // Since the WriteRpc is destructed a while after all of the + // InFlightOps and other associated objects were last touched, + // and because those operations were not all allocated together, + // they're likely to be strewn all around in RAM. This function + // then ends up cache-miss-bound. + // + // Ideally, we could change the allocation pattern to make them + // more contiguous, but it's a bit tricky -- this is client code, + // so we don't really have great control over how the write ops + // themselves are allocated. + // + // So, instead, we do some prefetching. The pointer graph looks like: + // + // vector + // [i] InFlightOp* pointer + // \----> InFlightOp instance + // | WriteOp* pointer + // | \-----> WriteOp instance + // | KuduPartialRow (embedded) + // | | isset_bitmap_ + // \-----> heap allocated memory + // + // + // So, we need to do three "layers" of prefetch. First, prefetch the + // InFlightOp instance. Then, prefetch the KuduPartialRow contained by + // the WriteOp that it points to. Then, prefetch the isset bitmap that + // the PartialRow points to. + // + // In order to get parallelism here, we need to stagger the prefetches: + // the "root" of the tree needs to look farthest in the future, then + // prefetch the next level, then prefetch the closest level, before + // eventually calling the destructor. + // + // Experimentally, it seems we get enough benefit from only prefetching + // one entry "ahead" in between each. + constexpr static int kPrefetchDistance = 1; + const int size = ops_.size(); + + auto iter = [this, size](int i) { + int ifo_prefetch = i + kPrefetchDistance * 3; + int op_prefetch = i + kPrefetchDistance * 2; + int row_prefetch = i + kPrefetchDistance; + if (ifo_prefetch >= 0 && ifo_prefetch < size) { + __builtin_prefetch(ops_[ifo_prefetch], 0, PREFETCH_HINT_T0); + } + if (op_prefetch >= 0 && op_prefetch < size) { + const auto* op = ops_[op_prefetch]->write_op.get(); + if (op) { + __builtin_prefetch(&op->row().isset_bitmap_, 0, PREFETCH_HINT_T0); + } + } + if (row_prefetch >= 0 && row_prefetch < size) { + const auto* op = ops_[row_prefetch]->write_op.get(); + if (op) { + __builtin_prefetch(op->row().isset_bitmap_, 0, PREFETCH_HINT_T0); + } + } + if (i >= 0) { + delete ops_[i]; + } + }; + + // Explicitly perform "loop splitting" to avoid the branches in the main + // body of the loop. + int i = -kPrefetchDistance * 3; + while (i < 0) { + iter(i++); + } + while (i < size - kPrefetchDistance * 3) { + iter(i++); + } + while (i < size) { + iter(i++); + } } string WriteRpc::ToString() const { diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h index 3a50907..d2fc631 100644 --- a/src/kudu/common/partial_row.h +++ b/src/kudu/common/partial_row.h @@ -44,6 +44,9 @@ class ColumnSchema; namespace client { class ClientTest_TestProjectionPredicatesFuzz_Test; class KuduWriteOperation; +namespace internal { +class WriteRpc; +} // namespace internal template struct SliceKeysTestSetup;// IWYU pragma: keep template struct IntKeysTestSetup; // IWYU pragma: keep } // namespace client @@ -586,6 +589,7 @@ class KUDU_EXPORT KuduPartialRow { private: friend class client::KuduWriteOperation; // for row_data_. + friend class client::internal::WriteRpc; // for row_data_. friend class KeyUtilTest; friend class PartitionSchema; friend class RowOperationsPBDecoder;