kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ale...@apache.org
Subject [1/4] kudu git commit: KUDU-1056 and KUDU-1020 Safe time for ksck checksum
Date Thu, 11 May 2017 18:23:09 GMT
Repository: kudu
Updated Branches:
  refs/heads/master ae7cfa04a -> de1daf282


KUDU-1056 and KUDU-1020 Safe time for ksck checksum

Now that safe time works properly, this patch enables
the snapshot checksum tests for ksck.

Change-Id: Ib45be20dcfa37fb85185302adf84d2c4a55f8c1e
Reviewed-on: http://gerrit.cloudera.org:8080/6843
Reviewed-by: David Ribeiro Alves <davidralves@gmail.com>
Tested-by: Will Berkeley <wdberkeley@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e6141a0a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e6141a0a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e6141a0a

Branch: refs/heads/master
Commit: e6141a0adf6e3c5a54be8cfdf5acd0f1ff65f714
Parents: ae7cfa0
Author: Will Berkeley <wdberkeley@apache.org>
Authored: Tue May 9 14:28:06 2017 -0700
Committer: David Ribeiro Alves <davidralves@gmail.com>
Committed: Thu May 11 04:18:44 2017 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck.cc             |  6 +----
 src/kudu/tools/ksck_remote-test.cc | 42 +++++++++++++++------------------
 2 files changed, 20 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/e6141a0a/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 67beac1..cdec7a6 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -537,11 +537,7 @@ Status Ksck::ChecksumData(const ChecksumOptions& opts) {
                     num_results, num_tablet_replicas);
 
   if (num_mismatches != 0) {
-    // TODO(KUDU-1020): remove the below note once safe time advancement is fully implemented.
-    return Status::Corruption(Substitute(
-        "$0 checksum mismatches were detected. "
-        "NOTE: if the table is actively being written to, this may generate spurious "
-        "checksum mismatches.", num_mismatches));
+    return Status::Corruption(Substitute("$0 checksum mismatches were detected.", num_mismatches));
   }
   if (num_errors != 0) {
     return Status::Aborted(Substitute("$0 errors were detected", num_errors));

http://git-wip-us.apache.org/repos/asf/kudu/blob/e6141a0a/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 00591af..d57a401 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -151,7 +151,12 @@ class RemoteKsckTest : public KuduTest {
       if (!status.ok()) {
         promise->Set(status);
       }
-      started_writing->CountDown(1);
+      // Wait for the first 100 writes so that it's very likely all replicas have committed a
+      // message in each tablet; otherwise, safe time might not have been updated on all replicas
+      // and some might refuse snapshot scans because of lag.
+      if (i > 100) {
+        started_writing->CountDown(1);
+      }
     }
     promise->Set(Status::OK());
   }
@@ -231,7 +236,9 @@ TEST_F(RemoteKsckTest, TestChecksum) {
   MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(30);
   Status s;
   while (MonoTime::Now() < deadline) {
+    ASSERT_OK(ksck_->CheckMasterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+    ASSERT_OK(ksck_->FetchInfoFromTabletServers());
 
     err_stream_.str("");
     s = ksck_->ChecksumData(ChecksumOptions(MonoDelta::FromSeconds(1), 16, false, 0));
@@ -254,7 +261,9 @@ TEST_F(RemoteKsckTest, TestChecksumTimeout) {
   uint64_t num_writes = 10000;
   LOG(INFO) << "Generating row writes...";
   ASSERT_OK(GenerateRowWrites(num_writes));
+  ASSERT_OK(ksck_->CheckMasterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+  ASSERT_OK(ksck_->FetchInfoFromTabletServers());
   // Use an impossibly low timeout value of zero!
   Status s = ksck_->ChecksumData(ChecksumOptions(MonoDelta::FromNanoseconds(0), 16, false, 0));
   ASSERT_TRUE(s.IsTimedOut()) << "Expected TimedOut Status, got: " << s.ToString();
@@ -273,45 +282,32 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshot) {
   CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30)));
 
   uint64_t ts = client_->GetLatestObservedTimestamp();
-  MonoTime start(MonoTime::Now());
-  MonoTime deadline = start + MonoDelta::FromSeconds(30);
-  Status s;
-  // TODO: We need to loop here because safe time is not yet implemented.
-  // Remove this loop when that is done. See KUDU-1056.
-  while (true) {
-    ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-    Status s = ksck_->ChecksumData(ChecksumOptions(MonoDelta::FromSeconds(10), 16, true, ts));
-    if (s.ok()) break;
-    if (MonoTime::Now() > deadline) break;
-    SleepFor(MonoDelta::FromMilliseconds(10));
-  }
-  if (!s.ok()) {
-    LOG(WARNING) << Substitute("Timed out after $0 waiting for ksck to become consistent on TS $1. "
-                               "Status: $2",
-                               (MonoTime::Now() - start).ToString(),
-                               ts, s.ToString());
-    EXPECT_OK(s); // To avoid ASAN complaints due to thread reading the CountDownLatch.
-  }
+  ASSERT_OK(ksck_->CheckMasterRunning());
+  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+  ASSERT_OK(ksck_->FetchInfoFromTabletServers());
+  ASSERT_OK(ksck_->ChecksumData(ChecksumOptions(MonoDelta::FromSeconds(10), 16, true, ts)));
   continue_writing.Store(false);
   ASSERT_OK(promise.Get());
   writer_thread->Join();
 }
 
 // Test that followers & leader wait until safe time to respond to a snapshot
-// scan at current timestamp. TODO: Safe time not yet implemented. See KUDU-1056.
-TEST_F(RemoteKsckTest, DISABLED_TestChecksumSnapshotCurrentTimestamp) {
+// scan at current timestamp.
+TEST_F(RemoteKsckTest, TestChecksumSnapshotCurrentTimestamp) {
   CountDownLatch started_writing(1);
   AtomicBool continue_writing(true);
   Promise<Status> promise;
   scoped_refptr<Thread> writer_thread;
 
-  Thread::Create("RemoteKsckTest", "TestChecksumSnapshot",
+  Thread::Create("RemoteKsckTest", "TestChecksumSnapshotCurrentTimestamp",
                  &RemoteKsckTest::GenerateRowWritesLoop, this,
                  &started_writing, boost::cref(continue_writing), &promise,
                  &writer_thread);
   CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30)));
 
+  ASSERT_OK(ksck_->CheckMasterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+  ASSERT_OK(ksck_->FetchInfoFromTabletServers());
   ASSERT_OK(ksck_->ChecksumData(ChecksumOptions(MonoDelta::FromSeconds(10), 16, true,
                                                 ChecksumOptions::kCurrentTimestamp)));
   continue_writing.Store(false);


Mime
View raw message