kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/4] kudu git commit: [tools] Address potential flakiness of TestMoveTablet
Date Tue, 01 Aug 2017 21:10:48 GMT
[tools] Address potential flakiness of TestMoveTablet

There are a few potential sources of flakiness in TestMoveTablet
and in the move tablet tool itself:
1. Leader step down may elect the same leader over and over,
preventing it from being removed from the config
2. With ongoing writes, the replicas may not all agree even though
they are all making progress
3. Sometimes leader changes, etc., time out

This patch addresses these problems:
1. After step down, the former leader will snooze its failure
detector for 2x a normal election timeout
2. The test waits for the new replica to see the new config
instead of for all servers to agree (since that's what we were
really waiting for anyway)
3. Timeouts have been bumped up to 30s, which is the norm for
similar tests.

Change-Id: Ic3aa5d816b403818f69baa71cf40b35b82ff9096
Reviewed-on: http://gerrit.cloudera.org:8080/7533
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <todd@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/af343140
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/af343140
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/af343140

Branch: refs/heads/master
Commit: af343140b081bf5b324198c2c840d0da4c7ffb40
Parents: ed827e0
Author: Will Berkeley <wdberkeley@apache.org>
Authored: Fri Jul 28 08:18:32 2017 -0700
Committer: Todd Lipcon <todd@apache.org>
Committed: Tue Aug 1 21:00:11 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus.cc |  5 +++++
 src/kudu/tools/kudu-admin-test.cc    |  5 +++--
 src/kudu/tools/tool_action_tablet.cc | 12 ++++++------
 3 files changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/af343140/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 1ebd0ef..cd0b9dc 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -503,6 +503,11 @@ Status RaftConsensus::StepDown(LeaderStepDownResponsePB* resp) {
     return Status::OK();
   }
   RETURN_NOT_OK(BecomeReplicaUnlocked());
+
+  // Snooze the failure detector for an extra leader failure timeout.
+  // This should ensure that a different replica is elected leader after this one steps down.
+  WARN_NOT_OK(SnoozeFailureDetector(MinimumElectionTimeout(), ALLOW_LOGGING),
+              "unable to snooze failure detector after stepping down");
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/af343140/src/kudu/tools/kudu-admin-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc
index 379959d..7237548 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -51,6 +51,7 @@ using itest::TServerDetails;
 using itest::WAIT_FOR_LEADER;
 using itest::WaitForReplicasReportedToMaster;
 using itest::WaitForServersToAgree;
+using itest::WaitUntilCommittedConfigNumVotersIs;
 using itest::WaitUntilCommittedOpIdIndexIs;
 using itest::WaitUntilTabletInState;
 using itest::WaitUntilTabletRunning;
@@ -236,8 +237,8 @@ TEST_F(AdminCliTest, TestMoveTablet) {
     for (const string& uuid : active_tservers) {
       InsertOrDie(&active_tservers_map, uuid, tablet_servers_[uuid]);
     }
-    ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), active_tservers_map,
-                                    tablet_id_, 1));
+    ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(FLAGS_num_replicas, active_tservers_map[add],
+                                                  tablet_id_, MonoDelta::FromSeconds(30)));
   }
   workload.StopAndJoin();
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/af343140/src/kudu/tools/tool_action_tablet.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_tablet.cc b/src/kudu/tools/tool_action_tablet.cc
index fefc43a..5eba3f6 100644
--- a/src/kudu/tools/tool_action_tablet.cc
+++ b/src/kudu/tools/tool_action_tablet.cc
@@ -45,7 +45,7 @@
 
 DEFINE_int64(move_copy_timeout_sec, 600,
              "Number of seconds to wait for tablet copy to complete when relocating a tablet");
-DEFINE_int64(move_leader_timeout_sec, 10,
+DEFINE_int64(move_leader_timeout_sec, 30,
              "Number of seconds to wait for a leader when relocating a leader tablet");
 
 namespace kudu {
@@ -365,13 +365,13 @@ Status MoveReplica(const RunnerContext &context) {
   const string& master_addresses_str = FindOrDie(context.required_args, kMasterAddressesArg);
   vector<string> master_addresses = strings::Split(master_addresses_str, ",");
   const string& tablet_id = FindOrDie(context.required_args, kTabletIdArg);
-  const string& rem_replica_uuid = FindOrDie(context.required_args, kFromTsUuidArg);
-  const string& add_replica_uuid = FindOrDie(context.required_args, kToTsUuidArg);
+  const string& from_ts_uuid = FindOrDie(context.required_args, kFromTsUuidArg);
+  const string& to_ts_uuid = FindOrDie(context.required_args, kToTsUuidArg);
 
   // Check the tablet is in perfect health and, if so, add the new server.
   RETURN_NOT_OK_PREPEND(DoKsckForTablet(master_addresses, tablet_id),
                         "ksck pre-move health check failed");
-  RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, add_replica_uuid,
+  RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, to_ts_uuid,
                                RaftPeerPB::VOTER, consensus::ADD_SERVER));
 
   // Wait until the tablet copy completes and the tablet returns to perfect health.
@@ -386,13 +386,13 @@ Status MoveReplica(const RunnerContext &context) {
   string leader_uuid;
   HostPort leader_hp;
   RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
-  if (rem_replica_uuid == leader_uuid) {
+  if (from_ts_uuid == leader_uuid) {
     RETURN_NOT_OK_PREPEND(ChangeLeader(client, tablet_id,
                                        leader_uuid, leader_hp,
                                        MonoDelta::FromSeconds(FLAGS_move_leader_timeout_sec)),
                           "failed changing leadership from the replica to be removed");
   }
-  return DoChangeConfig(master_addresses, tablet_id, rem_replica_uuid,
+  return DoChangeConfig(master_addresses, tablet_id, from_ts_uuid,
                         boost::none, consensus::REMOVE_SERVER);
 }
 


Mime
View raw message