kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/2] incubator-kudu git commit: Fix tombstoning a tombstoned tablet with the CAS option specified
Date Wed, 06 Jan 2016 23:45:52 GMT
Fix tombstoning a tombstoned tablet with the CAS option specified

This is a bug fix for an issue found by Binglin where the master
attempted to tombstone a tablet that had already been tombstoned, and
retried forever. This can happen when an RPC requesting a tombstone
operation times out, but was successful.

Improved the CAS delete unit test to cover this case.

Change-Id: I32ab0d4487378c43fdb77034eb0cffe68c287373
Reviewed-on: http://gerrit.cloudera.org:8080/1685
Tested-by: Internal Jenkins
Reviewed-by: Todd Lipcon <todd@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/bf1f244f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/bf1f244f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/bf1f244f

Branch: refs/heads/master
Commit: bf1f244fb740664027cea3d7e32c213e04f4041c
Parents: f85ee5f
Author: Mike Percy <mpercy@cloudera.com>
Authored: Wed Dec 23 21:19:45 2015 -0800
Committer: Todd Lipcon <todd@cloudera.com>
Committed: Tue Jan 5 21:39:41 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/delete_table-test.cc | 12 ++++++++++++
 src/kudu/master/catalog_manager.cc              |  4 +++-
 src/kudu/tserver/ts_tablet_manager.cc           |  7 ++++++-
 3 files changed, 21 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bf1f244f/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index 83b8b41..ce46432 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -426,6 +426,18 @@ TEST_F(DeleteTableTest, TestAtomicDeleteTablet) {
   opid_index = -1;
   ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, opid_index, timeout,
                                 &error_code));
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+
+  // Now that the tablet is already tombstoned, our opid_index should be
+  // ignored (because it's impossible to check it).
+  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, -9999, timeout,
+                                &error_code));
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+
+  // Same with TOMBSTONED -> DELETED.
+  ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_DELETED, -9999, timeout,
+                                &error_code));
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_DELETED);
 }
 
 TEST_F(DeleteTableTest, TestDeleteTableWithConcurrentWrites) {

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bf1f244f/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 9723f18..ace574b 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -2108,7 +2108,8 @@ class AsyncDeleteReplica : public RetrySpecificTSRpcTask {
       Status status = StatusFromPB(resp_.error().status());
 
       // Do not retry on a fatal error
-      switch (resp_.error().code()) {
+      TabletServerErrorPB::Code code = resp_.error().code();
+      switch (code) {
         case TabletServerErrorPB::TABLET_NOT_FOUND:
           LOG(WARNING) << "TS " << permanent_uuid_ << ": delete failed
for tablet " << tablet_id_
                        << " because the tablet was not found. No further retry: "
@@ -2122,6 +2123,7 @@ class AsyncDeleteReplica : public RetrySpecificTSRpcTask {
           break;
         default:
           LOG(WARNING) << "TS " << permanent_uuid_ << ": delete failed
for tablet " << tablet_id_
+                       << " with error code " << TabletServerErrorPB::Code_Name(code)
                        << ": " << status.ToString();
           break;
       }

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bf1f244f/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 7127ea2..dc1ca16 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -494,12 +494,17 @@ Status TSTabletManager::DeleteTablet(
     }
   }
 
+  // If the tablet is already deleted, the CAS check isn't possible because
+  // consensus and therefore the log is not available.
+  TabletDataState data_state = tablet_peer->tablet_metadata()->tablet_data_state();
+  bool tablet_deleted = (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED);
+
   // They specified an "atomic" delete. Check the committed config's opid_index.
   // TODO: There's actually a race here between the check and shutdown, but
   // it's tricky to fix. We could try checking again after the shutdown and
   // restarting the tablet if the local replica committed a higher config
   // change op during that time, or potentially something else more invasive.
-  if (cas_config_opid_index_less_or_equal) {
+  if (cas_config_opid_index_less_or_equal && !tablet_deleted) {
     scoped_refptr<consensus::Consensus> consensus = tablet_peer->shared_consensus();
     if (!consensus) {
       *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;


Mime
View raw message