kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/4] kudu git commit: rowset_metadata: cache min/max encoded keys
Date Thu, 05 Apr 2018 05:19:00 GMT
rowset_metadata: cache min/max encoded keys

This patch adds a new flag rowset_metadata_store_keys that, when true,
will indicate that Kudu should duplicate diskrowset min/max keys into
the rowset metadata. Doing so allows Kudu to read the keys from tablet
metadata and bootstrap tablets without having to fully initializing the
CFileReaders for the key columns of each rowset.

A small test is added to tablet_server-test that ensures we don't read
any extraneous bytes when starting up the tablet server if we're reading
keys from the rowset metadata.

I benchmarked this with ~50GB of flushed YCSB data (92 tablets of
varying sizes) on a single node with 4 data directories and a separate
WAL/metadata directory. To set up, I let the server flush/compact for a
while so bootstrap times wouldn't be dominated by reading WAL segments,
and set rowset_metadata_store_keys to true so the tserver had the option
of reading the cached keys from the rowset metadata at startup.

With the above setup, I started the tserver with a disabled maintenance
manager (to avoid further IO) and waited for the tablets to get to a
RUNNING state, recording the sum of the logged bootstrap times of each
tablet. I repeated this, configuring Kudu to read the keys from the
rowset metadata, and to read the keys from the data blocks, dropping OS
caches in between runs. The results are below.

Run number:                   1           2           3           Avg
Reading cached keys (s):      26.430      24.143      20.826      23.800
Not reading cached keys (s):  40.578      38.428      37.093      38.700

Based on this, ~15 seconds worth of bootstrapping time was spent on
initializing the key index readers, that could be avoided by reading the
keys from the rowset metadata instead.

Change-Id: I37d6f7160e3a7188753684e063963110f70e9b8d
Reviewed-on: http://gerrit.cloudera.org:8080/9372
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <todd@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c127477b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c127477b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c127477b

Branch: refs/heads/master
Commit: c127477b52e9443bfc07c4c30f55b92b74713b1c
Parents: 0402c33
Author: Andrew Wong <awong@cloudera.com>
Authored: Mon Feb 5 13:53:31 2018 -0800
Committer: Todd Lipcon <todd@apache.org>
Committed: Wed Apr 4 05:28:42 2018 +0000

----------------------------------------------------------------------
 src/kudu/tablet/cfile_set.cc           | 45 +++++++++++++++++++----------
 src/kudu/tablet/cfile_set.h            |  4 +--
 src/kudu/tablet/diskrowset.cc          | 11 +++++++
 src/kudu/tablet/metadata.proto         |  2 ++
 src/kudu/tablet/rowset_metadata.cc     | 15 ++++++++++
 src/kudu/tablet/rowset_metadata.h      | 37 ++++++++++++++++++++++++
 src/kudu/tserver/tablet_server-test.cc | 43 +++++++++++++++++++++++++++
 7 files changed, 139 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/cfile_set.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/cfile_set.cc b/src/kudu/tablet/cfile_set.cc
index 12a9fa3..63a28aa 100644
--- a/src/kudu/tablet/cfile_set.cc
+++ b/src/kudu/tablet/cfile_set.cc
@@ -14,15 +14,20 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
+#include "kudu/tablet/cfile_set.h"
 
 #include <algorithm>
 #include <memory>
 #include <ostream>
 #include <string>
+#include <utility>
 #include <vector>
 
+#include <boost/container/flat_map.hpp>
+#include <boost/container/vector.hpp>
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
+#include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
 #include "kudu/cfile/bloomfile.h"
@@ -40,11 +45,12 @@
 #include "kudu/fs/block_manager.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/dynamic_annotations.h"
+#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/macros.h"
+#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/gutil/strings/substitute.h"
-#include "kudu/tablet/cfile_set.h"
 #include "kudu/tablet/diskrowset.h"
 #include "kudu/tablet/rowset.h"
 #include "kudu/tablet/rowset_metadata.h"
@@ -56,6 +62,8 @@
 DEFINE_bool(consult_bloom_filters, true, "Whether to consult bloom filters on row presence
checks");
 TAG_FLAG(consult_bloom_filters, hidden);
 
+DECLARE_bool(rowset_metadata_store_keys);
+
 namespace kudu {
 
 class MemTracker;
@@ -145,12 +153,21 @@ Status CFileSet::DoOpen() {
                              &ad_hoc_idx_reader_));
   }
 
-  // However, the key reader should always be fully opened, so that we
-  // can figure out where in the rowset tree we belong.
-  RETURN_NOT_OK(key_index_reader()->Init());
-
-  // Determine the upper and lower key bounds for this CFileSet.
-  RETURN_NOT_OK(LoadMinMaxKeys());
+  // If the user specified to store the min/max keys in the rowset metadata,
+  // fetch them. Otherwise, load the min and max keys from the key reader.
+  if (FLAGS_rowset_metadata_store_keys && rowset_metadata_->has_encoded_keys())
{
+    min_encoded_key_ = rowset_metadata_->min_encoded_key();
+    max_encoded_key_ = rowset_metadata_->max_encoded_key();
+  } else {
+    RETURN_NOT_OK(LoadMinMaxKeys());
+  }
+  // Verify the loaded keys are valid.
+  if (Slice(min_encoded_key_).compare(max_encoded_key_) > 0) {
+    return Status::Corruption(Substitute("Min key $0 > max key $1",
+                                         KUDU_REDACT(Slice(min_encoded_key_).ToDebugString()),
+                                         KUDU_REDACT(Slice(max_encoded_key_).ToDebugString())),
+                              ToString());
+  }
 
   return Status::OK();
 }
@@ -175,20 +192,14 @@ Status CFileSet::OpenBloomReader() {
 }
 
 Status CFileSet::LoadMinMaxKeys() {
-  CFileReader *key_reader = key_index_reader();
+  CFileReader* key_reader = key_index_reader();
+  RETURN_NOT_OK(key_index_reader()->Init());
   if (!key_reader->GetMetadataEntry(DiskRowSet::kMinKeyMetaEntryName, &min_encoded_key_))
{
     return Status::Corruption("No min key found", ToString());
   }
   if (!key_reader->GetMetadataEntry(DiskRowSet::kMaxKeyMetaEntryName, &max_encoded_key_))
{
     return Status::Corruption("No max key found", ToString());
   }
-  if (Slice(min_encoded_key_).compare(max_encoded_key_) > 0) {
-    return Status::Corruption(Substitute("Min key $0 > max key $1",
-                                         KUDU_REDACT(Slice(min_encoded_key_).ToDebugString()),
-                                         KUDU_REDACT(Slice(max_encoded_key_).ToDebugString())),
-                              ToString());
-  }
-
   return Status::OK();
 }
 
@@ -213,6 +224,7 @@ CFileSet::Iterator *CFileSet::NewIterator(const Schema *projection) const
{
 }
 
 Status CFileSet::CountRows(rowid_t *count) const {
+  RETURN_NOT_OK(key_index_reader()->Init());
   return key_index_reader()->CountRows(count);
 }
 
@@ -300,6 +312,7 @@ Status CFileSet::CheckRowPresent(const RowSetKeyProbe &probe, bool
*present,
 }
 
 Status CFileSet::NewKeyIterator(CFileIterator **key_iter) const {
+  RETURN_NOT_OK(key_index_reader()->Init());
   return key_index_reader()->NewIterator(key_iter, CFileReader::CACHE_BLOCK);
 }
 
@@ -351,6 +364,8 @@ Status CFileSet::Iterator::CreateColumnIterators(const ScanSpec* spec)
{
 Status CFileSet::Iterator::Init(ScanSpec *spec) {
   CHECK(!initted_);
 
+  RETURN_NOT_OK(base_data_->CountRows(&row_count_));
+
   // Setup Key Iterator
   CFileIterator *tmp;
   RETURN_NOT_OK(base_data_->NewKeyIterator(&tmp));

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/cfile_set.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/cfile_set.h b/src/kudu/tablet/cfile_set.h
index e107c21..5c533ad 100644
--- a/src/kudu/tablet/cfile_set.h
+++ b/src/kudu/tablet/cfile_set.h
@@ -215,9 +215,7 @@ class CFileSet::Iterator : public ColumnwiseIterator {
         projection_(projection),
         initted_(false),
         cur_idx_(0),
-        prepared_count_(0) {
-    CHECK_OK(base_data_->CountRows(&row_count_));
-  }
+        prepared_count_(0) {}
 
   // Fill in col_iters_ for each of the requested columns.
   Status CreateColumnIterators(const ScanSpec* spec);

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/diskrowset.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index 43cb51d..29685f4 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -74,6 +74,11 @@ DEFINE_int32(default_composite_key_index_block_size_bytes, 4096,
              "Block size used for composite key indexes.");
 TAG_FLAG(default_composite_key_index_block_size_bytes, experimental);
 
+DEFINE_bool(rowset_metadata_store_keys, false,
+            "Whether to store the min/max encoded keys in the rowset "
+            "metadata. If false, keys will be read from the data blocks.");
+TAG_FLAG(rowset_metadata_store_keys, experimental);
+
 namespace kudu {
 
 class Mutex;
@@ -185,6 +190,9 @@ Status DiskRowSetWriter::AppendBlock(const RowBlock &block) {
   if (written_count_ == 0) {
     Slice enc_key = schema_->EncodeComparableKey(block.row(0), &last_encoded_key_);
     key_index_writer()->AddMetadataPair(DiskRowSet::kMinKeyMetaEntryName, enc_key);
+    if (FLAGS_rowset_metadata_store_keys) {
+      rowset_metadata_->set_min_encoded_key(enc_key.ToString());
+    }
     last_encoded_key_.clear();
   }
 
@@ -252,6 +260,9 @@ Status DiskRowSetWriter::FinishAndReleaseBlocks(BlockCreationTransaction*
transa
       << "First Key not <= Last key: first_key=" << KUDU_REDACT(first_enc_slice.ToDebugString())
       << "   last_key=" << KUDU_REDACT(last_enc_slice.ToDebugString());
   key_index_writer()->AddMetadataPair(DiskRowSet::kMaxKeyMetaEntryName, last_enc_slice);
+  if (FLAGS_rowset_metadata_store_keys) {
+    rowset_metadata_->set_max_encoded_key(last_enc_slice.ToString());
+  }
 
   // Finish writing the columns themselves.
   RETURN_NOT_OK(col_writer_->FinishAndReleaseBlocks(transaction));

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/metadata.proto
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/metadata.proto b/src/kudu/tablet/metadata.proto
index 70d8c1f..0c3110d 100644
--- a/src/kudu/tablet/metadata.proto
+++ b/src/kudu/tablet/metadata.proto
@@ -45,6 +45,8 @@ message RowSetDataPB {
   repeated DeltaDataPB undo_deltas = 5;
   optional BlockIdPB bloom_block = 6;
   optional BlockIdPB adhoc_index_block = 7;
+  optional bytes min_encoded_key = 8;
+  optional bytes max_encoded_key = 9;
 }
 
 // State flags indicating whether the tablet is in the middle of being copied

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/rowset_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_metadata.cc b/src/kudu/tablet/rowset_metadata.cc
index 87fa63d..440ef93 100644
--- a/src/kudu/tablet/rowset_metadata.cc
+++ b/src/kudu/tablet/rowset_metadata.cc
@@ -77,6 +77,15 @@ void RowSetMetadata::LoadFromPB(const RowSetDataPB& pb) {
   std::lock_guard<LockType> l(lock_);
   id_ = pb.id();
 
+  // Load the min/max keys.
+  min_encoded_key_ = boost::none;
+  max_encoded_key_ = boost::none;
+  DCHECK_EQ(pb.has_min_encoded_key(), pb.has_max_encoded_key());
+  if (pb.has_min_encoded_key() && pb.has_max_encoded_key()) {
+    min_encoded_key_ = pb.min_encoded_key();
+    max_encoded_key_ = pb.max_encoded_key();
+  }
+
   // Load Bloom File.
   bloom_block_ = BlockId();
   if (pb.has_bloom_block()) {
@@ -148,6 +157,12 @@ void RowSetMetadata::ToProtobuf(RowSetDataPB *pb) {
   if (!adhoc_index_block_.IsNull()) {
     adhoc_index_block_.CopyToPB(pb->mutable_adhoc_index_block());
   }
+
+  // Write the min/max keys.
+  if (has_encoded_keys_unlocked()) {
+    pb->set_min_encoded_key(*min_encoded_key_);
+    pb->set_max_encoded_key(*max_encoded_key_);
+  }
 }
 
 const std::string RowSetMetadata::ToString() const {

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tablet/rowset_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_metadata.h b/src/kudu/tablet/rowset_metadata.h
index dfc1ed2..c0d156e 100644
--- a/src/kudu/tablet/rowset_metadata.h
+++ b/src/kudu/tablet/rowset_metadata.h
@@ -22,10 +22,12 @@
 #include <memory>
 #include <mutex>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include <boost/container/flat_map.hpp>
 #include <boost/container/vector.hpp>
+#include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 
 #include "kudu/common/schema.h"
@@ -119,6 +121,37 @@ class RowSetMetadata {
 
   Status CommitUndoDeltaDataBlock(const BlockId& block_id);
 
+  bool has_encoded_keys_unlocked() const {
+    return min_encoded_key_ != boost::none && max_encoded_key_ != boost::none;
+  }
+
+  bool has_encoded_keys() const {
+    std::lock_guard<LockType> l(lock_);
+    return has_encoded_keys_unlocked();
+  }
+
+  void set_min_encoded_key(std::string min_encoded_key) {
+    std::lock_guard<LockType> l(lock_);
+    min_encoded_key_ = std::move(min_encoded_key);
+  }
+
+  void set_max_encoded_key(std::string max_encoded_key) {
+    std::lock_guard<LockType> l(lock_);
+    max_encoded_key_ = std::move(max_encoded_key);
+  }
+
+  std::string min_encoded_key() const {
+    std::lock_guard<LockType> l(lock_);
+    DCHECK(min_encoded_key_ != boost::none);
+    return *min_encoded_key_;
+  }
+
+  std::string max_encoded_key() const {
+    std::lock_guard<LockType> l(lock_);
+    DCHECK(max_encoded_key_ != boost::none);
+    return *max_encoded_key_;
+  }
+
   BlockId bloom_block() const {
     std::lock_guard<LockType> l(lock_);
     return bloom_block_;
@@ -219,6 +252,10 @@ class RowSetMetadata {
   // Protects the below mutable fields.
   mutable LockType lock_;
 
+  // The min and max keys of the rowset.
+  boost::optional<std::string> min_encoded_key_;
+  boost::optional<std::string> max_encoded_key_;
+
   BlockId bloom_block_;
   BlockId adhoc_index_block_;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/c127477b/src/kudu/tserver/tablet_server-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index ad0bb04..63b2998 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -145,6 +145,7 @@ DEFINE_int32(delete_tablet_bench_num_flushes, 200,
 DECLARE_bool(crash_on_eio);
 DECLARE_bool(enable_maintenance_manager);
 DECLARE_bool(fail_dns_resolution);
+DECLARE_bool(rowset_metadata_store_keys);
 DECLARE_double(env_inject_eio);
 DECLARE_int32(flush_threshold_mb);
 DECLARE_int32(flush_threshold_secs);
@@ -157,6 +158,7 @@ DECLARE_string(block_manager);
 DECLARE_string(env_inject_eio_globs);
 
 // Declare these metrics prototypes for simpler unit testing of their behavior.
+METRIC_DECLARE_counter(block_manager_total_bytes_read);
 METRIC_DECLARE_counter(rows_inserted);
 METRIC_DECLARE_counter(rows_updated);
 METRIC_DECLARE_counter(rows_deleted);
@@ -3224,5 +3226,46 @@ TEST_F(TabletServerTest, TestNoMetricsForTombstonedTablet) {
   }
 }
 
+// Test ensuring that when rowset min/max keys are stored with and read from
+// the rowset metadata, the tablet server doesn't read any blocks when
+// bootstrapping.
+TEST_F(TabletServerTest, TestKeysInRowsetMetadataPreventStartupSeeks) {
+  // Write the min/max keys to the rowset metadata. This gives us the option to
+  // read from the CFile vs from the rowset metadata.
+  FLAGS_rowset_metadata_store_keys = true;
+  InsertTestRowsDirect(0, 100);
+  ASSERT_OK(tablet_replica_->tablet()->Flush());
+  // Disable the maintenance manager so we don't get any seeks from
+  // maintenance operations when we restart.
+  FLAGS_enable_maintenance_manager = false;
+
+  const auto restart_server_and_check_bytes_read = [&] (bool keys_in_rowset_meta) {
+    FLAGS_rowset_metadata_store_keys = keys_in_rowset_meta;
+    // Reset the replica to avoid any lingering references.
+    // Restart the server and wait for the tablet to bootstrap.
+    tablet_replica_.reset();
+    mini_server_->Shutdown();
+
+    ASSERT_OK(mini_server_->Restart());
+    ASSERT_OK(mini_server_->WaitStarted());
+
+    scoped_refptr<Counter> bytes_read_metric =
+        METRIC_block_manager_total_bytes_read.Instantiate(
+            mini_server_->server()->metric_entity());
+    int64_t bm_bytes_read = bytes_read_metric->value();
+    if (keys_in_rowset_meta) {
+      ASSERT_EQ(0, bm_bytes_read);
+    } else {
+      ASSERT_LT(0, bm_bytes_read);
+    }
+  };
+
+  // Test both reading and not reading the keys from the rowset metadata,
+  // making sure we read bytes in the block manager only when expected (no
+  // bytes should be read by the BM if storing keys in the rowset metadata).
+  restart_server_and_check_bytes_read(/*keys_in_rowset_meta=*/ false);
+  restart_server_and_check_bytes_read(/*keys_in_rowset_meta=*/ true);
+}
+
 } // namespace tserver
 } // namespace kudu


Mime
View raw message