kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [1/3] kudu git commit: cfile-test: add a 100M-string file with low cardinality
Date Wed, 05 Apr 2017 23:11:30 GMT
Repository: kudu
Updated Branches:
  refs/heads/master 34310a8e9 -> 27b3de7ab


cfile-test: add a 100M-string file with low cardinality

This serves as a better benchmark of the dictionary-encoded string path.

Change-Id: I9c92f3b6c04c4c2ef50497ad1dc1380b7152d9e9
Reviewed-on: http://gerrit.cloudera.org:8080/6432
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <dralves@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/516d67eb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/516d67eb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/516d67eb

Branch: refs/heads/master
Commit: 516d67eb119d7e1b361eb017a00909f89b645e13
Parents: 34310a8
Author: Todd Lipcon <todd@apache.org>
Authored: Sun Mar 19 16:15:21 2017 -0700
Committer: Todd Lipcon <todd@apache.org>
Committed: Wed Apr 5 19:20:13 2017 +0000

----------------------------------------------------------------------
 src/kudu/cfile/cfile-test-base.h |  6 +++--
 src/kudu/cfile/cfile-test.cc     | 46 ++++++++++++++++++++---------------
 src/kudu/tablet/compaction.cc    |  6 ++++-
 3 files changed, 36 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/516d67eb/src/kudu/cfile/cfile-test-base.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile-test-base.h b/src/kudu/cfile/cfile-test-base.h
index 695cd99..58b995d 100644
--- a/src/kudu/cfile/cfile-test-base.h
+++ b/src/kudu/cfile/cfile-test-base.h
@@ -370,8 +370,10 @@ class CFileTestBase : public KuduTest {
 
     ASSERT_OK(w.Start());
 
-    // Append given number of values to the test tree
-    const size_t kBufferSize = 8192;
+    // Append given number of values to the test tree. We use 100 to match
+    // the output block size of compaction (kCompactionOutputBlockNumRows in
+    // compaction.cc, unfortunately not linkable from the cfile/ module)
+    const size_t kBufferSize = 100;
     size_t i = 0;
     while (i < num_entries) {
       int towrite = std::min(num_entries - i, kBufferSize);

http://git-wip-us.apache.org/repos/asf/kudu/blob/516d67eb/src/kudu/cfile/cfile-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile-test.cc b/src/kudu/cfile/cfile-test.cc
index 34b5125..7fa56f5 100644
--- a/src/kudu/cfile/cfile-test.cc
+++ b/src/kudu/cfile/cfile-test.cc
@@ -30,6 +30,7 @@
 #include "kudu/fs/fs-test-util.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/stringprintf.h"
+#include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/stopwatch.h"
 #include "kudu/util/test_macros.h"
@@ -48,6 +49,7 @@ METRIC_DECLARE_entity(server);
 
 using std::shared_ptr;
 using std::unique_ptr;
+using strings::Substitute;
 
 namespace kudu {
 namespace cfile {
@@ -291,6 +293,24 @@ class TestCFile : public CFileTestBase {
   }
 #endif
 
+  void TestWriteDictEncodingLowCardinalityStrings(int64_t num_rows) {
+    BlockId block_id;
+    LOG_TIMING(INFO, Substitute("writing $0 strings with dupes", num_rows)) {
+      LOG(INFO) << "Starting writefile";
+      // The second parameter specify how many distinct strings are there
+      DuplicateStringDataGenerator<false> generator("hello %zu", 256);
+      WriteTestFile(&generator, DICT_ENCODING, NO_COMPRESSION, num_rows, NO_FLAGS, &block_id);
+      LOG(INFO) << "Done writing";
+    }
+
+    LOG_TIMING(INFO, Substitute("reading $0 strings with dupes", num_rows)) {
+      LOG(INFO) << "Starting readfile";
+      size_t n;
+      TimeReadFile(fs_manager_.get(), block_id, &n);
+      ASSERT_EQ(num_rows, n);
+      LOG(INFO) << "End readfile";
+    }
+  }
 };
 
 // Subclass of TestCFile which is parameterized on the block cache type.
@@ -393,10 +413,14 @@ TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsPrefixEncoding) {
   TestWrite100MFileStrings(PREFIX_ENCODING);
 }
 
-TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsDictEncoding) {
+TEST_P(TestCFileBothCacheTypes, TestWrite100MUniqueStringsDictEncoding) {
   TestWrite100MFileStrings(DICT_ENCODING);
 }
 
+TEST_P(TestCFileBothCacheTypes, TestWrite100MLowCardinalityStringsDictEncoding) {
+  TestWriteDictEncodingLowCardinalityStrings(100 * 1e6);
+}
+
 TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsPlainEncoding) {
   TestWrite100MFileStrings(PLAIN_ENCODING);
 }
@@ -423,24 +447,8 @@ TEST_P(TestCFileBothCacheTypes, TestWrite1MUniqueFileStringsDictEncoding) {
 }
 
 // Write and Read 1 million strings, which contains duplicates with dictionary encoding
-TEST_P(TestCFileBothCacheTypes, TestWrite1MDuplicateFileStringsDictEncoding) {
-  BlockId block_id;
-  LOG_TIMING(INFO, "writing 1M duplicate strings") {
-    LOG(INFO) << "Starting writefile";
-
-    // The second parameter specify how many distinct strings are there
-    DuplicateStringDataGenerator<false> generator("hello %zu", 256);
-    WriteTestFile(&generator, DICT_ENCODING, NO_COMPRESSION, 1000000, NO_FLAGS, &block_id);
-    LOG(INFO) << "Done writing";
-  }
-
-  LOG_TIMING(INFO, "reading 1M strings") {
-    LOG(INFO) << "Starting readfile";
-    size_t n;
-    TimeReadFile(fs_manager_.get(), block_id, &n);
-    ASSERT_EQ(1000000, n);
-    LOG(INFO) << "End readfile";
-  }
+TEST_P(TestCFileBothCacheTypes, TestWrite1MLowCardinalityStringsDictEncoding) {
+  TestWriteDictEncodingLowCardinalityStrings(1000000);
 }
 
 TEST_P(TestCFileBothCacheTypes, TestReadWriteUInt32) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/516d67eb/src/kudu/tablet/compaction.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/compaction.cc b/src/kudu/tablet/compaction.cc
index a851c10..00ba3e8 100644
--- a/src/kudu/tablet/compaction.cc
+++ b/src/kudu/tablet/compaction.cc
@@ -50,6 +50,10 @@ namespace tablet {
 
 namespace {
 
+// The maximum number of rows we will output at a time during
+// compaction.
+const int kCompactionOutputBlockNumRows = 100;
+
 // Advances to the last mutation in a mutation list.
 void AdvanceToLastInList(const Mutation** m) {
   if (*m == nullptr) return;
@@ -1057,7 +1061,7 @@ Status FlushCompactionInput(CompactionInput* input,
 
   DCHECK(out->schema().has_column_ids());
 
-  RowBlock block(out->schema(), 100, nullptr);
+  RowBlock block(out->schema(), kCompactionOutputBlockNumRows, nullptr);
 
   while (input->HasMoreBlocks()) {
     RETURN_NOT_OK(input->PrepareBlock(&rows));


Mime
View raw message