From commits-return-8259-archive-asf-public=cust-asf.ponee.io@kudu.apache.org  Mon Dec  9 22:34:09 2019
Return-Path: <commits-return-8259-archive-asf-public=cust-asf.ponee.io@kudu.apache.org>
X-Original-To: archive-asf-public@cust-asf.ponee.io
Delivered-To: archive-asf-public@cust-asf.ponee.io
Received: from mail.apache.org (hermes.apache.org [207.244.88.153])
	by mx-eu-01.ponee.io (Postfix) with SMTP id D9C0818067E
	for <archive-asf-public@cust-asf.ponee.io>; Mon,  9 Dec 2019 23:34:08 +0100 (CET)
Received: (qmail 43311 invoked by uid 500); 9 Dec 2019 22:34:08 -0000
Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm
Precedence: bulk
List-Help: <mailto:commits-help@kudu.apache.org>
List-Unsubscribe: <mailto:commits-unsubscribe@kudu.apache.org>
List-Post: <mailto:commits@kudu.apache.org>
List-Id: <commits.kudu.apache.org>
Reply-To: dev@kudu.apache.org
Delivered-To: mailing list commits@kudu.apache.org
Received: (qmail 43235 invoked by uid 99); 9 Dec 2019 22:34:07 -0000
Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70)
    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 09 Dec 2019 22:34:07 +0000
Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33)
	id C74ED8D80D; Mon,  9 Dec 2019 22:34:07 +0000 (UTC)
Date: Mon, 09 Dec 2019 22:34:09 +0000
To: "commits@kudu.apache.org" <commits@kudu.apache.org>
Subject: [kudu] 02/03: client: optimize destruction of WriteRpc
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit
From: abukor@apache.org
In-Reply-To: <157593084770.574.3719990368659625480@gitbox.apache.org>
References: <157593084770.574.3719990368659625480@gitbox.apache.org>
X-Git-Host: gitbox.apache.org
X-Git-Repo: kudu
X-Git-Refname: refs/heads/master
X-Git-Reftype: branch
X-Git-Rev: dff0349f9f9c6dde759bcd97e2f76957d8043df7
X-Git-NotificationType: diff
X-Git-Multimail-Version: 1.5.dev
Auto-Submitted: auto-generated
Message-Id: <20191209223407.C74ED8D80D@gitbox.apache.org>

This is an automated email from the ASF dual-hosted git repository.

abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit dff0349f9f9c6dde759bcd97e2f76957d8043df7
Author: Todd Lipcon <todd@apache.org>
AuthorDate: Fri Dec 6 15:12:59 2019 -0800

    client: optimize destruction of WriteRpc
    
    When writing batches with lots of operations, the WriteRpc destructor
    ends up cache-miss bound, since the various InFlightOp and WriteOps are
    strewn all about memory. This adds some prefetching which sped things up
    noticeably (~37%) in a benchmark which ends up bound by the reactor thread on
    the client side.
    
      $ perf stat ./build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8
    
    Before:
      Generator report
        time total  : 51403.6 ms
        time per row: 0.000642545 ms
      Dropping auto-created table 'default.loadgen_auto_d289807fc12a4b1c861f79b19af9ec8e'
    
       Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
    
              180,585.24 msec task-clock                #    3.508 CPUs utilized
                  25,373      context-switches          #    0.141 K/sec
                   1,648      cpu-migrations            #    0.009 K/sec
                  50,927      page-faults               #    0.282 K/sec
         726,022,544,856      cycles                    #    4.020 GHz                      (83.33%)
          71,782,315,500      stalled-cycles-frontend   #    9.89% frontend cycles idle     (83.36%)
         412,273,652,207      stalled-cycles-backend    #   56.79% backend cycles idle      (83.29%)
         408,271,477,858      instructions              #    0.56  insn per cycle
                                                        #    1.01  stalled cycles per insn  (83.35%)
          75,750,045,948      branches                  #  419.470 M/sec                    (83.33%)
             296,247,270      branch-misses             #    0.39% of all branches          (83.34%)
    
            51.475433628 seconds time elapsed
    
           178.590913000 seconds user
             1.935099000 seconds sys
    
    After:
      Generator report
        time total  : 37293.8 ms
        time per row: 0.000466172 ms
      Dropping auto-created table 'default.loadgen_auto_ece2f41beef94a9fa032c77899f7e61c'
    
       Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
    
              189,125.49 msec task-clock                #    5.060 CPUs utilized
                  29,363      context-switches          #    0.155 K/sec
                   2,043      cpu-migrations            #    0.011 K/sec
                  48,405      page-faults               #    0.256 K/sec
         772,496,448,279      cycles                    #    4.085 GHz                      (83.33%)
         129,999,474,226      stalled-cycles-frontend   #   16.83% frontend cycles idle     (83.36%)
         300,049,388,250      stalled-cycles-backend    #   38.84% backend cycles idle      (83.30%)
         414,415,517,571      instructions              #    0.54  insn per cycle
                                                        #    0.72  stalled cycles per insn  (83.32%)
          76,829,647,882      branches                  #  406.236 M/sec                    (83.34%)
             352,749,453      branch-misses             #    0.46% of all branches          (83.35%)
    
            37.376785122 seconds time elapsed
    
           186.834651000 seconds user
             2.143945000 seconds sys
    
    Change-Id: I538f995f7ec161e746885c6b31cd1dccd72139b0
    Reviewed-on: http://gerrit.cloudera.org:8080/14868
    Reviewed-by: Adar Dembo <adar@cloudera.com>
    Tested-by: Todd Lipcon <todd@apache.org>
---
 src/kudu/client/batcher.cc    | 79 +++++++++++++++++++++++++++++++++++++++++--
 src/kudu/common/partial_row.h |  4 +++
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc
index e6ce9dd..c68e77d 100644
--- a/src/kudu/client/batcher.cc
+++ b/src/kudu/client/batcher.cc
@@ -41,6 +41,7 @@
 #include "kudu/client/write_op-internal.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/common.pb.h"
+#include "kudu/common/partial_row.h"
 #include "kudu/common/partition.h"
 #include "kudu/common/row_operations.h"
 #include "kudu/common/wire_protocol.h"
@@ -50,7 +51,7 @@
 #include "kudu/gutil/bind_helpers.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/map-util.h"
-#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/rpc/connection.h"
 #include "kudu/rpc/request_tracker.h"
@@ -76,7 +77,6 @@ using strings::Substitute;
 
 namespace kudu {
 
-class KuduPartialRow;
 class Schema;
 
 namespace rpc {
@@ -342,7 +342,80 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
 }
 
 WriteRpc::~WriteRpc() {
-  STLDeleteElements(&ops_);
+  // Since the WriteRpc is destructed a while after all of the
+  // InFlightOps and other associated objects were last touched,
+  // and because those operations were not all allocated together,
+  // they're likely to be strewn all around in RAM. This function
+  // then ends up cache-miss-bound.
+  //
+  // Ideally, we could change the allocation pattern to make them
+  // more contiguous, but it's a bit tricky -- this is client code,
+  // so we don't really have great control over how the write ops
+  // themselves are allocated.
+  //
+  // So, instead, we do some prefetching. The pointer graph looks like:
+  //
+  // vector<InFlightOp*>
+  //    [i] InFlightOp* pointer
+  //         \----> InFlightOp instance
+  //                | WriteOp* pointer
+  //                |   \-----> WriteOp instance
+  //                            | KuduPartialRow (embedded)
+  //                            | | isset_bitmap_
+  //                                   \-----> heap allocated memory
+  //
+  //
+  // So, we need to do three "layers" of prefetch. First, prefetch the
+  // InFlightOp instance. Then, prefetch the KuduPartialRow contained by
+  // the WriteOp that it points to. Then, prefetch the isset bitmap that
+  // the PartialRow points to.
+  //
+  // In order to get parallelism here, we need to stagger the prefetches:
+  // the "root" of the tree needs to look farthest in the future, then
+  // prefetch the next level, then prefetch the closest level, before
+  // eventually calling the destructor.
+  //
+  // Experimentally, it seems we get enough benefit from only prefetching
+  // one entry "ahead" in between each.
+  constexpr static int kPrefetchDistance = 1;
+  const int size = ops_.size();
+
+  auto iter = [this, size](int i) {
+    int ifo_prefetch = i + kPrefetchDistance * 3;
+    int op_prefetch = i + kPrefetchDistance * 2;
+    int row_prefetch = i + kPrefetchDistance;
+    if (ifo_prefetch >= 0 && ifo_prefetch < size) {
+      __builtin_prefetch(ops_[ifo_prefetch], 0, PREFETCH_HINT_T0);
+    }
+    if (op_prefetch >= 0 && op_prefetch < size) {
+      const auto* op = ops_[op_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(&op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (row_prefetch >= 0 && row_prefetch < size) {
+      const auto* op = ops_[row_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (i >= 0) {
+      delete ops_[i];
+    }
+  };
+
+  // Explicitly perform "loop splitting" to avoid the branches in the main
+  // body of the loop.
+  int i = -kPrefetchDistance * 3;
+  while (i < 0) {
+    iter(i++);
+  }
+  while (i < size - kPrefetchDistance * 3) {
+    iter(i++);
+  }
+  while (i < size) {
+    iter(i++);
+  }
 }
 
 string WriteRpc::ToString() const {
diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h
index 3a50907..d2fc631 100644
--- a/src/kudu/common/partial_row.h
+++ b/src/kudu/common/partial_row.h
@@ -44,6 +44,9 @@ class ColumnSchema;
 namespace client {
 class ClientTest_TestProjectionPredicatesFuzz_Test;
 class KuduWriteOperation;
+namespace internal {
+class WriteRpc;
+} // namespace internal
 template<typename KeyTypeWrapper> struct SliceKeysTestSetup;// IWYU pragma: keep
 template<typename KeyTypeWrapper> struct IntKeysTestSetup;  // IWYU pragma: keep
 } // namespace client
@@ -586,6 +589,7 @@ class KUDU_EXPORT KuduPartialRow {
 
  private:
   friend class client::KuduWriteOperation;   // for row_data_.
+  friend class client::internal::WriteRpc;   // for row_data_.
   friend class KeyUtilTest;
   friend class PartitionSchema;
   friend class RowOperationsPBDecoder;