kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/2] incubator-kudu git commit: Don't crash TS if consensus metadata is corrupted
Date Mon, 09 May 2016 21:18:51 GMT
Don't crash TS if consensus metadata is corrupted

If the consensus metadata somehow gets corrupted with a too-early term, the TS
should not crash with a CHECK failure. Instead, it should just mark that tablet
as FAILED.

Currently, the leader does not auto-evict a FAILED replica. But, the administrator
can use the CLI tools to delete the bad replica, which should cause it to get
automatically repaired.

This fix is based on an issue encountered in Bruce Song Zhang's cluster. His
cluster had been affected by KUDU-1436, which caused tablets on many servers to
have incorrect consensus metadata. Because of the CHECK that was in place, he
was unable to restart and recover those servers, causing an outage. With this
patch in place, only the affected tablets would have been affected, and
assuming a majority of replicas were still available, the table availability
would not have been compromised.

Change-Id: If9f85c1ce31a32e89e57c74e9750e66073b9752c
Reviewed-on: http://gerrit.cloudera.org:8080/3006
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/5d82edb2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/5d82edb2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/5d82edb2

Branch: refs/heads/master
Commit: 5d82edb2ebe01a3ea73d851b80f36cc61c4002ae
Parents: eabcdab
Author: Todd Lipcon <todd@apache.org>
Authored: Mon May 9 11:05:47 2016 -0700
Committer: Todd Lipcon <todd@apache.org>
Committed: Mon May 9 21:17:15 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus_state.cc      | 12 ++++--
 .../external_mini_cluster_fs_inspector.cc       | 27 ++++++++++---
 .../external_mini_cluster_fs_inspector.h        |  7 ++++
 .../integration-tests/raft_consensus-itest.cc   | 40 ++++++++++++++++++++
 4 files changed, 76 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/5d82edb2/src/kudu/consensus/raft_consensus_state.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus_state.cc b/src/kudu/consensus/raft_consensus_state.cc
index c607b00..4a70efb 100644
--- a/src/kudu/consensus/raft_consensus_state.cc
+++ b/src/kudu/consensus/raft_consensus_state.cc
@@ -60,10 +60,14 @@ Status ReplicaState::StartUnlocked(const OpId& last_id_in_wal) {
 
   // Our last persisted term can be higher than the last persisted operation
   // (i.e. if we called an election) but reverse should never happen.
-  CHECK_LE(last_id_in_wal.term(), GetCurrentTermUnlocked()) << LogPrefixUnlocked()
-      << "The last op in the WAL with id " << OpIdToString(last_id_in_wal)
-      << " has a term (" << last_id_in_wal.term() << ") that is greater
"
-      << "than the latest recorded term, which is " << GetCurrentTermUnlocked();
+  if (last_id_in_wal.term() > GetCurrentTermUnlocked()) {
+    return Status::Corruption(Substitute(
+        "The last op in the WAL with id $0 has a term ($1) that is greater "
+        "than the latest recorded term, which is $2",
+        OpIdToString(last_id_in_wal),
+        last_id_in_wal.term(),
+        GetCurrentTermUnlocked()));
+  }
 
   next_index_ = last_id_in_wal.index() + 1;
   last_received_op_id_.CopyFrom(last_id_in_wal);

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/5d82edb2/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
index a350f55..c255463 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
@@ -169,18 +169,33 @@ Status ExternalMiniClusterFsInspector::ReadTabletSuperBlockOnTS(int
index,
   return pb_util::ReadPBContainerFromPath(env_, superblock_path, sb);
 }
 
+string ExternalMiniClusterFsInspector::GetConsensusMetadataPathOnTS(int index,
+                                                                    const string& tablet_id)
const {
+  string data_dir = cluster_->tablet_server(index)->data_dir();
+  string cmeta_dir = JoinPathSegments(data_dir, FsManager::kConsensusMetadataDirName);
+  return JoinPathSegments(cmeta_dir, tablet_id);
+}
+
 Status ExternalMiniClusterFsInspector::ReadConsensusMetadataOnTS(int index,
                                                                  const string& tablet_id,
                                                                  ConsensusMetadataPB* cmeta_pb)
{
-  string data_dir = cluster_->tablet_server(index)->data_dir();
-  string cmeta_dir = JoinPathSegments(data_dir, FsManager::kConsensusMetadataDirName);
-  string cmeta_file = JoinPathSegments(cmeta_dir, tablet_id);
-  if (!env_->FileExists(cmeta_file)) {
-    return Status::NotFound("Consensus metadata file not found", cmeta_file);
+  auto cmeta_path = GetConsensusMetadataPathOnTS(index, tablet_id);
+  if (!env_->FileExists(cmeta_path)) {
+    return Status::NotFound("Consensus metadata file not found", cmeta_path);
   }
-  return pb_util::ReadPBContainerFromPath(env_, cmeta_file, cmeta_pb);
+  return pb_util::ReadPBContainerFromPath(env_, cmeta_path, cmeta_pb);
 }
 
+Status ExternalMiniClusterFsInspector::WriteConsensusMetadataOnTS(
+    int index,
+    const string& tablet_id,
+    const ConsensusMetadataPB& cmeta_pb) {
+  auto cmeta_path = GetConsensusMetadataPathOnTS(index, tablet_id);
+  return pb_util::WritePBContainerToPath(env_, cmeta_path, cmeta_pb,
+                                         pb_util::OVERWRITE, pb_util::NO_SYNC);
+}
+
+
 Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(
     int index,
     const string& tablet_id,

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/5d82edb2/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
index c4f21ad..600e30d 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
@@ -75,6 +75,10 @@ class ExternalMiniClusterFsInspector {
                                   tablet::TabletSuperBlockPB* sb);
   Status ReadConsensusMetadataOnTS(int index, const std::string& tablet_id,
                                    consensus::ConsensusMetadataPB* cmeta_pb);
+  Status WriteConsensusMetadataOnTS(int index,
+                                    const std::string& tablet_id,
+                                    const consensus::ConsensusMetadataPB& cmeta_pb);
+
   Status CheckTabletDataStateOnTS(int index,
                                   const std::string& tablet_id,
                                   const std::vector<tablet::TabletDataState>& expected_states);
@@ -106,6 +110,9 @@ class ExternalMiniClusterFsInspector {
       const MonoDelta& timeout = MonoDelta::FromSeconds(30));
 
  private:
+  std::string GetConsensusMetadataPathOnTS(int index,
+                                           const std::string& tablet_id) const;
+
   Env* const env_;
   ExternalMiniCluster* const cluster_;
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/5d82edb2/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index ab72b52..62f8f82 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -2471,6 +2471,46 @@ TEST_F(RaftConsensusITest, TestUpdateConsensusErrorNonePrepared) {
   ASSERT_STR_CONTAINS(resp.ShortDebugString(), "Could not prepare a single transaction");
 }
 
+// Test that, if the raft metadata on a replica is corrupt, then the server
+// doesn't crash, but instead just marks the tablet as corrupt.
+TEST_F(RaftConsensusITest, TestCorruptReplicaMetadata) {
+  // Start cluster and wait until we have a stable leader.
+  BuildAndStart({}, {});
+  ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_,
+                                  tablet_id_, 1));
+
+
+  // Shut down one of the tablet servers, and then muck
+  // with its consensus metadata to corrupt it.
+  auto* ts = cluster_->tablet_server(0);
+  ts->Shutdown();
+  consensus::ConsensusMetadataPB cmeta_pb;
+  ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(0, tablet_id_, &cmeta_pb));
+  cmeta_pb.set_current_term(cmeta_pb.current_term() - 1);
+  ASSERT_OK(inspect_->WriteConsensusMetadataOnTS(0, tablet_id_, cmeta_pb));
+
+  ASSERT_OK(ts->Restart());
+
+  // The server should come up with a 'FAILED' status because of the corrupt
+  // metadata.
+  ASSERT_OK(WaitUntilTabletInState(tablet_servers_[ts->uuid()],
+                                   tablet_id_,
+                                   tablet::FAILED,
+                                   MonoDelta::FromSeconds(30)));
+
+  // Currently, the tablet server does not automatically delete FAILED replicas.
+  // So, manually delete the bad replica in order to recover.
+  ASSERT_OK(itest::DeleteTablet(tablet_servers_[ts->uuid()], tablet_id_,
+                                tablet::TABLET_DATA_TOMBSTONED, boost::none,
+                                MonoDelta::FromSeconds(30)));
+
+  // A new good copy should get created.
+  ASSERT_OK(WaitUntilTabletInState(tablet_servers_[ts->uuid()],
+                                   tablet_id_,
+                                   tablet::RUNNING,
+                                   MonoDelta::FromSeconds(30)));
+}
+
 }  // namespace tserver
 }  // namespace kudu
 


Mime
View raw message