kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mpe...@apache.org
Subject [2/2] kudu git commit: tablet copy: Allow voting from crashed initial tablet copies
Date Wed, 06 Sep 2017 23:36:45 GMT
tablet copy: Allow voting from crashed initial tablet copies

The change in c35e4f0093dcdec625d1f647924d854c7bc9f3de missed an
important case, which is when the target tablet server in a tablet copy
operation crashes while in the middle of a tablet copy. In that case,
the 'tombstone_last_logged_opid' of 1.0 was not persisted in the
superblock. This manifested as a very flaky
TabletCopyITest.TestTabletCopyNewReplicaFailureCanVote test, which would
fail about 15% of the time.

With the change in this patch, that test now passes reliably:

http://dist-test.cloudera.org/job?job_id=mpercy.1504654266.31446

Change-Id: Idbc9cecac701c24ebe05759adbbb1fbe2b140506
Reviewed-on: http://gerrit.cloudera.org:8080/7961
Reviewed-by: Todd Lipcon <todd@apache.org>
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <aserbin@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/4f41a3cb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/4f41a3cb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/4f41a3cb

Branch: refs/heads/master
Commit: 4f41a3cb68019f5a56764a126de5debffe5b58d6
Parents: 2211afc
Author: Mike Percy <mpercy@apache.org>
Authored: Tue Sep 5 12:39:55 2017 -0700
Committer: Mike Percy <mpercy@apache.org>
Committed: Wed Sep 6 23:35:51 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus-test-util.h        |   4 +-
 src/kudu/integration-tests/tablet_copy-itest.cc | 155 ++++++++++++++++---
 src/kudu/master/sys_catalog.cc                  |   3 +
 src/kudu/tablet/tablet-harness.h                |   1 +
 src/kudu/tablet/tablet_bootstrap-test.cc        |   2 +
 src/kudu/tablet/tablet_metadata.cc              |  17 +-
 src/kudu/tablet/tablet_metadata.h               |   9 +-
 src/kudu/tools/kudu-tool-test.cc                |   4 +-
 src/kudu/tserver/tablet_copy_client.cc          |  16 +-
 src/kudu/tserver/ts_tablet_manager.cc           |   1 +
 src/kudu/tserver/ts_tablet_manager.h            |   5 +
 11 files changed, 182 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/consensus/consensus-test-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus-test-util.h b/src/kudu/consensus/consensus-test-util.h
index c73e50b..409841b 100644
--- a/src/kudu/consensus/consensus-test-util.h
+++ b/src/kudu/consensus/consensus-test-util.h
@@ -48,8 +48,8 @@
 
 #define ASSERT_OPID_EQ(left, right) \
   do { \
-    const OpId& TOKENPASTE2(_left, __LINE__) = (left); \
-    const OpId& TOKENPASTE2(_right, __LINE__) = (right); \
+    const consensus::OpId& TOKENPASTE2(_left, __LINE__) = (left); \
+    const consensus::OpId& TOKENPASTE2(_right, __LINE__) = (right); \
     if (!consensus::OpIdEquals(TOKENPASTE2(_left, __LINE__), TOKENPASTE2(_right, __LINE__)))
{ \
       FAIL() << "Expected: " \
             << pb_util::SecureShortDebugString(TOKENPASTE2(_left, __LINE__)) <<
"\n" \

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index e8be313..b151962 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -40,10 +40,12 @@
 #include "kudu/common/wire_protocol-test-util.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus-test-util.h"
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
+#include "kudu/consensus/opid_util.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/gscoped_ptr.h"
@@ -72,6 +74,7 @@
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/path_util.h"
+#include "kudu/util/pb_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
@@ -93,12 +96,17 @@ using kudu::client::KuduTableCreator;
 using kudu::consensus::COMMITTED_OPID;
 using kudu::consensus::ConsensusMetadataManager;
 using kudu::consensus::ConsensusMetadataPB;
+using kudu::consensus::MakeOpId;
 using kudu::consensus::RaftPeerPB;
 using kudu::itest::TServerDetails;
 using kudu::itest::WaitForNumTabletServers;
+using kudu::pb_util::SecureShortDebugString;
 using kudu::tablet::TABLET_DATA_COPYING;
 using kudu::tablet::TABLET_DATA_DELETED;
+using kudu::tablet::TABLET_DATA_READY;
 using kudu::tablet::TABLET_DATA_TOMBSTONED;
+using kudu::tablet::TabletDataState;
+using kudu::tablet::TabletSuperBlockPB;
 using kudu::tserver::ListTabletsResponsePB;
 using kudu::tserver::ListTabletsResponsePB_StatusAndSchemaPB;
 using kudu::tserver::TabletCopyClient;
@@ -967,16 +975,16 @@ TEST_F(TabletCopyITest, TestTabletCopyClearsLastLoggedOpId) {
   workload.StopAndJoin();
   ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, 1));
 
-  // No last-logged opid should initially be stored in the superblock.
+  // No last-logged opid should initially be stored in the superblock on a brand-new tablet.
   tablet::TabletSuperBlockPB superblock_pb;
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
   ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid());
 
   // Now tombstone the leader.
   ASSERT_OK(itest::DeleteTablet(leader, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, kTimeout));
 
   // We should end up with a last-logged opid in the superblock.
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
   ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid());
   consensus::OpId last_logged_opid = superblock_pb.tombstone_last_logged_opid();
   ASSERT_EQ(1, last_logged_opid.term());
@@ -988,11 +996,15 @@ TEST_F(TabletCopyITest, TestTabletCopyClearsLastLoggedOpId) {
                                    cluster_->tablet_server(follower_index)->bound_rpc_hostport(),
                                    1, // We are in term 1.
                                    kTimeout));
-  ASSERT_OK(itest::WaitUntilTabletRunning(leader, tablet_id, kTimeout));
 
-  // Ensure that the last-logged opid has been cleared from the superblock.
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
-  ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid());
+  ASSERT_EVENTUALLY([&] {
+    // Ensure that the last-logged opid has been cleared from the superblock
+    // once it persists the TABLET_DATA_READY state to disk.
+    ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
+    ASSERT_EQ(TABLET_DATA_READY, superblock_pb.tablet_data_state());
+    ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid())
+        << SecureShortDebugString(superblock_pb);
+  });
 }
 
 // Test that the tablet copy thread pool being full results in throttling and
@@ -1097,13 +1109,24 @@ TEST_F(TabletCopyITest, TestTabletCopyThrottling) {
   LOG(INFO) << "Number of in progress responses: " << num_inprogress;
 }
 
+class TabletCopyFailureITest : public TabletCopyITest,
+                               public ::testing::WithParamInterface<const char*> {
+};
+// Trigger tablet copy failures a couple different ways: session timeout and
+// crash. We want all tablet copy attempts to fail after initial setup.
+const char* kTabletCopyFailureITestFlags[] = {
+  "--tablet_copy_early_session_timeout_prob=1.0",
+  "--tablet_copy_fault_crash_on_fetch_all=1.0"
+};
+INSTANTIATE_TEST_CASE_P(FailureCause, TabletCopyFailureITest,
+                        ::testing::ValuesIn(kTabletCopyFailureITestFlags));
+
 // Test that a failed tablet copy of a brand-new replica results in still being
 // able to vote while tombstoned.
-TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
+TEST_P(TabletCopyFailureITest, TestTabletCopyNewReplicaFailureCanVote) {
   const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
-  // We want all tablet copy attempts to fail after initial setup.
-  NO_FATALS(StartCluster({"--tablet_copy_early_session_timeout_prob=1.0"}, {},
-                         /*num_tablet_servers=*/ 4));
+  const string& failure_flag = GetParam();
+  NO_FATALS(StartCluster({failure_flag}, {}, /*num_tablet_servers=*/ 4));
   TestWorkload workload(cluster_.get());
   workload.Setup();
 
@@ -1118,10 +1141,6 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
     replica_uuids.insert(replica.ts_info().permanent_uuid());
   }
 
-  TServerDetails* leader_ts;
-  ASSERT_OK(itest::FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
-  ASSERT_OK(itest::WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, kTimeout));
-
   string new_replica_uuid;
   for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
     if (!ContainsKey(replica_uuids, cluster_->tablet_server(i)->uuid())) {
@@ -1132,9 +1151,16 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
   ASSERT_FALSE(new_replica_uuid.empty());
   auto* new_replica_ts = ts_map_[new_replica_uuid];
 
-  // Adding a server will trigger a tablet copy.
-  ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_replica_ts, RaftPeerPB::VOTER,
-                             boost::none, kTimeout));
+  // Allow retries in case the tablet leadership is unstable.
+  ASSERT_EVENTUALLY([&] {
+    TServerDetails* leader_ts;
+    ASSERT_OK(itest::FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
+    ASSERT_OK(itest::WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, kTimeout));
+
+    // Adding a server will trigger a tablet copy.
+    ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_replica_ts, RaftPeerPB::VOTER,
+                              boost::none, kTimeout));
+  });
 
   // Wait until the new replica has done its initial download, and has either
   // failed or is about to fail (the check is nondeterministic) plus has
@@ -1155,6 +1181,16 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
     // Remove the "early timeout" flag.
     cluster_->tablet_server(i)->mutable_flags()->pop_back();
   }
+
+  TabletSuperBlockPB superblock;
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(new_replica_idx, tablet_id, &superblock));
+  LOG(INFO) << "Restarting tablet servers. New replica TS UUID: " << new_replica_uuid
+            << ", tablet data state: "
+            << tablet::TabletDataState_Name(superblock.tablet_data_state())
+            << ", last-logged opid: " << superblock.tombstone_last_logged_opid();
+  ASSERT_TRUE(superblock.has_tombstone_last_logged_opid());
+  ASSERT_OPID_EQ(MakeOpId(1, 0), superblock.tombstone_last_logged_opid());
+
   ASSERT_OK(cluster_->master()->Restart());
   ASSERT_OK(cluster_->tablet_server_by_uuid(new_replica_uuid)->Restart());
   auto iter = replica_uuids.cbegin();
@@ -1169,6 +1205,85 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
   workload.StopAndJoin();
 }
 
+// Test that a failed tablet copy over a tombstoned replica retains the
+// last-logged OpId from the tombstoned replica.
+TEST_P(TabletCopyFailureITest, TestFailedTabletCopyRetainsLastLoggedOpId) {
+  MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const string& tablet_copy_failure_flag = GetParam();
+  NO_FATALS(StartCluster({"--enable_leader_failure_detection=false",
+                          tablet_copy_failure_flag},
+                         {"--catalog_manager_wait_for_new_tablets_to_elect_leader=false"}));
+
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+
+  int first_leader_index = 0;
+  TServerDetails* first_leader = ts_map_[cluster_->tablet_server(first_leader_index)->uuid()];
+
+  // Figure out the tablet id of the created tablet.
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(first_leader, 1, kTimeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Wait until all replicas are up and running.
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()],
+                                            tablet_id, kTimeout));
+  }
+
+  // Elect a leader for term 1, then generate some data to copy.
+  ASSERT_OK(itest::StartElection(first_leader, tablet_id, kTimeout));
+  workload.Start();
+  const int kMinBatches = 10;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GE(workload.batches_completed(), kMinBatches);
+  });
+  workload.StopAndJoin();
+  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, 1));
+
+  // Tombstone the first leader.
+  ASSERT_OK(itest::DeleteTablet(first_leader, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
+                                kTimeout));
+
+  // We should end up with a last-logged opid in the superblock of the first leader.
+  tablet::TabletSuperBlockPB superblock_pb;
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(first_leader_index, tablet_id, &superblock_pb));
+  ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid());
+  consensus::OpId last_logged_opid = superblock_pb.tombstone_last_logged_opid();
+  ASSERT_EQ(1, last_logged_opid.term());
+  ASSERT_GT(last_logged_opid.index(), kMinBatches);
+  int64_t initial_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(first_leader_index,
tablet_id);
+
+  // Elect a new leader.
+  int second_leader_index = 1;
+  TServerDetails* second_leader = ts_map_[cluster_->tablet_server(second_leader_index)->uuid()];
+  ASSERT_OK(itest::StartElection(second_leader, tablet_id, kTimeout));
+
+  // The second leader will initiate a tablet copy on the first leader. Wait
+  // for it to fail (via crash or abort).
+  ASSERT_EVENTUALLY([&] {
+    // Superblock must have been modified.
+    int64_t cur_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(first_leader_index, tablet_id);
+    ASSERT_GT(cur_mtime, initial_mtime);
+    ASSERT_OK(inspect_->CheckTabletDataStateOnTS(first_leader_index, tablet_id,
+                                                 { TABLET_DATA_COPYING, TABLET_DATA_TOMBSTONED
}));
+  });
+
+  // Bounce the cluster to get rid of leaders and reach a steady state.
+  cluster_->Shutdown();
+  ASSERT_OK(cluster_->Restart());
+
+  // After failing the tablet copy, we should retain the old tombstoned
+  // tablet's last-logged opid.
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(first_leader_index, tablet_id, &superblock_pb));
+    ASSERT_EQ(TABLET_DATA_TOMBSTONED, superblock_pb.tablet_data_state());
+    ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid())
+        << SecureShortDebugString(superblock_pb);
+    ASSERT_OPID_EQ(last_logged_opid, superblock_pb.tombstone_last_logged_opid());
+  });
+}
+
 // This test uses CountBlocksUnderManagement() which only works with the
 // LogBlockManager.
 #ifndef __APPLE__
@@ -1185,9 +1300,9 @@ class BadTabletCopyITest : public TabletCopyITest,
 // server.
 const char* kFlagFaultOnFetch = "fault_crash_on_handle_tc_fetch_data";
 const char* kFlagEarlyTimeout = "tablet_copy_early_session_timeout_prob";
-const char* tablet_copy_failure_flags[] = { kFlagFaultOnFetch, kFlagEarlyTimeout };
+const char* kBadTabletCopyITestFlags[] = { kFlagFaultOnFetch, kFlagEarlyTimeout };
 INSTANTIATE_TEST_CASE_P(FaultFlags, BadTabletCopyITest,
-                        ::testing::ValuesIn(tablet_copy_failure_flags));
+                        ::testing::ValuesIn(kBadTabletCopyITestFlags));
 
 void BadTabletCopyITest::LoadTable(TestWorkload* workload, int min_rows, int min_blocks)
{
   const MonoDelta kTimeout = MonoDelta::FromSeconds(30);

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/master/sys_catalog.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc
index b6593b8..eeb95e0 100644
--- a/src/kudu/master/sys_catalog.cc
+++ b/src/kudu/master/sys_catalog.cc
@@ -28,6 +28,7 @@
 #include <utility>
 #include <vector>
 
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <google/protobuf/util/message_differencer.h>
@@ -49,6 +50,7 @@
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/consensus_peers.h"
 #include "kudu/consensus/log.h"
+#include "kudu/consensus/opid.pb.h"
 #include "kudu/consensus/opid_util.h"
 #include "kudu/consensus/quorum_util.h"
 #include "kudu/consensus/raft_consensus.h"
@@ -238,6 +240,7 @@ Status SysCatalogTable::CreateNew(FsManager *fs_manager) {
                                                   schema, partition_schema,
                                                   partitions[0],
                                                   tablet::TABLET_DATA_READY,
+                                                  /*tombstone_last_logged_opid=*/ boost::none,
                                                   &metadata));
 
   RaftConfigPB config;

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet-harness.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet-harness.h b/src/kudu/tablet/tablet-harness.h
index 5fc547c..eb62691 100644
--- a/src/kudu/tablet/tablet-harness.h
+++ b/src/kudu/tablet/tablet-harness.h
@@ -100,6 +100,7 @@ class TabletHarness {
                                                partition.first,
                                                partition.second,
                                                TABLET_DATA_READY,
+                                               /*tombstone_last_logged_opid=*/ boost::none,
                                                &metadata));
     if (options_.enable_metrics) {
       metrics_registry_.reset(new MetricRegistry());

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_bootstrap-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_bootstrap-test.cc b/src/kudu/tablet/tablet_bootstrap-test.cc
index e31574f..4af600a 100644
--- a/src/kudu/tablet/tablet_bootstrap-test.cc
+++ b/src/kudu/tablet/tablet_bootstrap-test.cc
@@ -27,6 +27,7 @@
 #include <utility>
 #include <vector>
 
+#include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
@@ -125,6 +126,7 @@ class BootstrapTest : public LogTestBase {
                                                partition.first,
                                                partition.second,
                                                TABLET_DATA_READY,
+                                               /*tombstone_last_logged_opid=*/ boost::none,
                                                meta));
     (*meta)->SetLastDurableMrsIdForTests(mrs_id);
     if ((*meta)->GetRowSetForTests(0) != nullptr) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index d5a271f..f00de4c 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -86,6 +86,7 @@ Status TabletMetadata::CreateNew(FsManager* fs_manager,
                                  const PartitionSchema& partition_schema,
                                  const Partition& partition,
                                  const TabletDataState& initial_tablet_data_state,
+                                 boost::optional<OpId> tombstone_last_logged_opid,
                                  scoped_refptr<TabletMetadata>* metadata) {
 
   // Verify that no existing tablet exists with the same ID.
@@ -105,7 +106,8 @@ Status TabletMetadata::CreateNew(FsManager* fs_manager,
                                                        schema,
                                                        partition_schema,
                                                        partition,
-                                                       initial_tablet_data_state));
+                                                       initial_tablet_data_state,
+                                                       std::move(tombstone_last_logged_opid)));
   RETURN_NOT_OK(ret->Flush());
   dir_group_cleanup.cancel();
 
@@ -130,6 +132,7 @@ Status TabletMetadata::LoadOrCreate(FsManager* fs_manager,
                                     const PartitionSchema& partition_schema,
                                     const Partition& partition,
                                     const TabletDataState& initial_tablet_data_state,
+                                    boost::optional<OpId> tombstone_last_logged_opid,
                                     scoped_refptr<TabletMetadata>* metadata) {
   Status s = Load(fs_manager, tablet_id, metadata);
   if (s.ok()) {
@@ -139,13 +142,13 @@ Status TabletMetadata::LoadOrCreate(FsManager* fs_manager,
         schema.ToString()));
     }
     return Status::OK();
-  } else if (s.IsNotFound()) {
+  }
+  if (s.IsNotFound()) {
     return CreateNew(fs_manager, tablet_id, table_name, table_id, schema,
                      partition_schema, partition, initial_tablet_data_state,
-                     metadata);
-  } else {
-    return s;
+                     std::move(tombstone_last_logged_opid), metadata);
   }
+  return s;
 }
 
 vector<BlockIdPB> TabletMetadata::CollectBlockIdPBs(const TabletSuperBlockPB& superblock)
{
@@ -266,7 +269,8 @@ TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id,
                                string table_name, string table_id,
                                const Schema& schema, PartitionSchema partition_schema,
                                Partition partition,
-                               const TabletDataState& tablet_data_state)
+                               const TabletDataState& tablet_data_state,
+                               boost::optional<OpId> tombstone_last_logged_opid)
     : state_(kNotWrittenYet),
       tablet_id_(std::move(tablet_id)),
       table_id_(std::move(table_id)),
@@ -279,6 +283,7 @@ TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id,
       table_name_(std::move(table_name)),
       partition_schema_(std::move(partition_schema)),
       tablet_data_state_(tablet_data_state),
+      tombstone_last_logged_opid_(std::move(tombstone_last_logged_opid)),
       num_flush_pins_(0),
       needs_flush_(false),
       pre_flush_callback_(Bind(DoNothingStatusClosure)) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h
index 9c34638..e9cde54 100644
--- a/src/kudu/tablet/tablet_metadata.h
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -82,6 +82,7 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata>
{
                           const PartitionSchema& partition_schema,
                           const Partition& partition,
                           const TabletDataState& initial_tablet_data_state,
+                          boost::optional<consensus::OpId> tombstone_last_logged_opid,
                           scoped_refptr<TabletMetadata>* metadata);
 
   // Load existing metadata from disk.
@@ -102,6 +103,7 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata>
{
                              const PartitionSchema& partition_schema,
                              const Partition& partition,
                              const TabletDataState& initial_tablet_data_state,
+                             boost::optional<consensus::OpId> tombstone_last_logged_opid,
                              scoped_refptr<TabletMetadata>* metadata);
 
   static std::vector<BlockIdPB> CollectBlockIdPBs(
@@ -262,13 +264,14 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata>
{
 
   // Constructor for creating a new tablet.
   //
-  // TODO: get rid of this many-arg constructor in favor of just passing in a
-  // SuperBlock, which already contains all of these fields.
+  // TODO(todd): get rid of this many-arg constructor in favor of just passing
+  // in a SuperBlock, which already contains all of these fields.
   TabletMetadata(FsManager* fs_manager, std::string tablet_id,
                  std::string table_name, std::string table_id,
                  const Schema& schema, PartitionSchema partition_schema,
                  Partition partition,
-                 const TabletDataState& tablet_data_state);
+                 const TabletDataState& tablet_data_state,
+                 boost::optional<consensus::OpId> tombstone_last_logged_opid);
 
   // Constructor for loading an existing tablet.
   TabletMetadata(FsManager* fs_manager, std::string tablet_id);

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tools/kudu-tool-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index b4171b0..62e7b6e 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -1021,7 +1021,9 @@ TEST_F(ToolTest, TestLocalReplicaDumpMeta) {
   scoped_refptr<TabletMetadata> meta;
   TabletMetadata::CreateNew(&fs, kTestTablet, kTestTableName, kTestTableId,
                   kSchemaWithIds, partition.first, partition.second,
-                  tablet::TABLET_DATA_READY, &meta);
+                  tablet::TABLET_DATA_READY,
+                  /*tombstone_last_logged_opid=*/ boost::none,
+                  &meta);
   string stdout;
   NO_FATALS(RunActionStdoutString(Substitute("local_replica dump meta $0 "
                                              "--fs_wal_dir=$1 "

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index af5cce7..506a840 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -301,11 +301,18 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                      copy_peer_uuid));
     }
 
+    // Retain the last-logged OpId from the previous tombstone in case this
+    // tablet copy is aborted.
+    if (last_logged_opid) {
+      *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid;
+    }
+
     // Remove any existing orphaned blocks and WALs from the tablet, and
     // set the data state to 'COPYING'.
     RETURN_NOT_OK_PREPEND(
         TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
-                                          tablet::TABLET_DATA_COPYING, boost::none),
+                                          tablet::TABLET_DATA_COPYING,
+                                          /*last_logged_opid=*/ boost::none),
         "Could not replace superblock with COPYING data state");
     CHECK_OK(fs_manager_->dd_manager()->CreateDataDirGroup(tablet_id_));
   } else {
@@ -327,7 +334,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                             schema,
                                             partition_schema,
                                             partition,
-                                            tablet::TABLET_DATA_COPYING,
+                                            superblock_->tablet_data_state(),
+                                            superblock_->tombstone_last_logged_opid(),
                                             &meta_));
   }
   CHECK(fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_,
@@ -369,6 +377,7 @@ Status TabletCopyClient::Finish() {
   LOG_WITH_PREFIX(INFO) << "Tablet Copy complete. Replacing tablet superblock.";
   SetStatusMessage("Replacing tablet superblock");
   superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY);
+  superblock_->clear_tombstone_last_logged_opid();
   RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_));
 
   if (FLAGS_tablet_copy_save_downloaded_metadata) {
@@ -397,7 +406,8 @@ Status TabletCopyClient::Abort() {
   // Delete all of the tablet data, including blocks and WALs.
   RETURN_NOT_OK_PREPEND(
       TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
-                                        tablet::TABLET_DATA_TOMBSTONED, boost::none),
+                                        tablet::TABLET_DATA_TOMBSTONED,
+                                        /*last_logged_opid=*/ boost::none),
       LogPrefix() + "Failed to tombstone tablet after aborting tablet copy");
 
   SetStatusMessage(Substitute("Tombstoned tablet $0: Tablet copy aborted", tablet_id_));

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 85055fd..8e23e61 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -290,6 +290,7 @@ Status TSTabletManager::CreateNewTablet(const string& table_id,
                               partition_schema,
                               partition,
                               TABLET_DATA_READY,
+                              boost::none,
                               &meta),
     "Couldn't create tablet metadata");
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 64897dc..f2a0795 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -185,6 +185,11 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
 
   // Delete the tablet using the specified delete_type as the final metadata
   // state. Deletes the on-disk data, metadata, as well as all WAL segments.
+  //
+  // If set, 'last_logged_opid' will be persisted in the
+  // 'tombstone_last_logged_opid' field in the tablet metadata. Otherwise, if
+  // 'last_logged_opid' is equal to boost::none, the tablet metadata will
+  // retain its previous value of 'tombstone_last_logged_opid', if any.
   static Status DeleteTabletData(
       const scoped_refptr<tablet::TabletMetadata>& meta,
       const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,


Mime
View raw message