kudu-commits mailing list archives

From mpe...@apache.org
Subject [1/2] kudu git commit: [itests] additional test for 3-4-3 replication
Date Sat, 24 Mar 2018 02:26:38 GMT
Repository: kudu
Updated Branches:
  refs/heads/branch-1.7.x 3d113b9b3 -> 310dbe053


[itests] additional test for 3-4-3 replication

Added a new test for the 3-4-3 replication scheme to make sure the
system is able to replace, 'in-place', a replica that has fallen
behind the WAL segment GC threshold. For example, with a replication
factor of 3, the system should be able to recover in a cluster of just
3 tablet servers if a tablet replica falls behind the WAL segment GC.
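
A condensed sketch of the scenario this adds (the full version is in
raft_consensus_nonvoter-itest.cc below; fixture members such as
tablet_servers_ and helpers such as BuildAndStart() come from the
existing test base classes):

  // Roll WAL segments aggressively so a stalled follower falls behind GC.
  vector<string> tserver_flags;
  AddFlagsForLogRolls(&tserver_flags);
  // Run both the master and the tablet servers with the 3-4-3 scheme.
  tserver_flags.emplace_back("--raft_prepare_replacement_before_eviction=true");
  vector<string> master_flags = {
    "--raft_prepare_replacement_before_eviction=true",
  };
  NO_FATALS(BuildAndStart(tserver_flags, master_flags));

  // Make one follower unresponsive and write until its WAL segments are GCed.
  string leader_uuid, fell_behind_uuid;
  int64_t orig_term;
  NO_FATALS(CauseFollowerToFallBehindLogGC(
      tablet_servers_, &leader_uuid, &orig_term, &fell_behind_uuid,
      BehindWalGcBehavior::SHUTDOWN_RESTART));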

Change-Id: Ieb1be6d1df751affc0fcfca5f2069eaad5888606
Reviewed-on: http://gerrit.cloudera.org:8080/9754
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mpercy@apache.org>
(cherry picked from commit c73f023da32015bd2d51d50099a2737cd749ceb2)
Reviewed-on: http://gerrit.cloudera.org:8080/9785
Tested-by: Alexey Serbin <aserbin@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b5e990bf
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b5e990bf
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b5e990bf

Branch: refs/heads/branch-1.7.x
Commit: b5e990bf16068ff09389ac5454de724dc0e79e09
Parents: 3d113b9
Author: Alexey Serbin <aserbin@cloudera.com>
Authored: Wed Mar 21 20:16:13 2018 -0700
Committer: Alexey Serbin <aserbin@cloudera.com>
Committed: Sat Mar 24 01:18:46 2018 +0000

----------------------------------------------------------------------
 .../raft_consensus-itest-base.cc                |  50 +++++--
 .../raft_consensus-itest-base.h                 |  33 +++--
 .../raft_consensus_nonvoter-itest.cc            | 132 ++++++++++++++++++-
 3 files changed, 191 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b5e990bf/src/kudu/integration-tests/raft_consensus-itest-base.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.cc b/src/kudu/integration-tests/raft_consensus-itest-base.cc
index f10b652..aab6981 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.cc
@@ -39,6 +39,7 @@
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/opid.pb.h"
 #include "kudu/gutil/gscoped_ptr.h"
+#include "kudu/gutil/macros.h"
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
 #include "kudu/integration-tests/mini_cluster_fs_inspector.h"
@@ -56,6 +57,8 @@
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
+// IWYU confuses BehindWalGcBehavior::SHUTDOWN with TabletStatePB::SHUTDOWN.
+// IWYU pragma: no_include "kudu/tablet/metadata.pb.h"
 
 DEFINE_int32(num_client_threads, 8,
              "Number of client threads to launch");
@@ -193,17 +196,30 @@ void RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
     const itest::TabletServerMap& tablet_servers,
     string* leader_uuid,
     int64_t* orig_term,
-    string* fell_behind_uuid) {
+    string* fell_behind_uuid,
+    BehindWalGcBehavior tserver_behavior) {
   MonoDelta kTimeout = MonoDelta::FromSeconds(10);
   // Wait for all of the replicas to have acknowledged the elected
   // leader and logged the first NO_OP.
   ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers, tablet_id_, 1));
 
-  // Pause one server. This might be the leader, but pausing it will cause
-  // a leader election to happen.
+  // Pause or shut down one server. This might be the leader, and making it
+  // unresponsive will cause a leader election to happen.
   TServerDetails* replica = (*tablet_replicas_.begin()).second;
   ExternalTabletServer* replica_ets = cluster_->tablet_server_by_uuid(replica->uuid());
-  ASSERT_OK(replica_ets->Pause());
+  switch (tserver_behavior) {
+    case BehindWalGcBehavior::STOP_CONTINUE:
+      ASSERT_OK(replica_ets->Pause());
+      break;
+    case BehindWalGcBehavior::SHUTDOWN_RESTART: FALLTHROUGH_INTENDED;
+    case BehindWalGcBehavior::SHUTDOWN:
+      replica_ets->Shutdown();
+      break;
+    default:
+      CHECK(false) << tserver_behavior
+                   << ": unknown behavior for tserver behind WAL GC threshold";
+      break;
+  }
 
   // Find a leader. In case we paused the leader above, this will wait until
   // we have elected a new one.
@@ -257,16 +273,26 @@ void RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
     LOG(INFO) << "Servers converged with original term " << *orig_term;
   }
 
-  // Resume the follower.
-  LOG(INFO) << "Resuming  " << replica->uuid();
-  ASSERT_OK(replica_ets->Resume());
+  if (tserver_behavior == BehindWalGcBehavior::STOP_CONTINUE) {
+    // Resume the follower.
+    LOG(INFO) << "Resuming " << replica->uuid();
+    ASSERT_OK(replica_ets->Resume());
+  } else if (tserver_behavior == BehindWalGcBehavior::SHUTDOWN_RESTART) {
+    LOG(INFO) << "Restarting " << replica->uuid();
+    ASSERT_OK(replica_ets->Restart());
+  }
 
-  // Ensure that none of the tablet servers crashed.
+  // Make sure the involved servers didn't crash.
   for (const auto& e: tablet_servers) {
-  //for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
-    // Make sure the involved servsers didn't crash.
-    ASSERT_TRUE(cluster_->tablet_server_by_uuid(e.first)->IsProcessAlive())
-        << "Tablet server " << e.first << " crashed";
+    const auto& uuid = e.first;
+    if (tserver_behavior == BehindWalGcBehavior::SHUTDOWN &&
+        uuid == replica->uuid()) {
+      ASSERT_TRUE(cluster_->tablet_server_by_uuid(uuid)->IsShutdown())
+          << "Tablet server " << uuid << " is not shutdown";
+    } else {
+      ASSERT_TRUE(cluster_->tablet_server_by_uuid(uuid)->IsProcessAlive())
+          << "Tablet server " << uuid << " crashed";
+    }
   }
   *fell_behind_uuid = replica->uuid();
 }
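
Note on the new SHUTDOWN mode (a sketch; names are from this patch): the
helper deliberately leaves the affected tserver down, so the caller is
responsible for bringing it back after verifying whatever requires the
server to stay down, e.g.:

  // After CauseFollowerToFallBehindLogGC(..., BehindWalGcBehavior::SHUTDOWN):
  ASSERT_OK(cluster_->tablet_server_by_uuid(fell_behind_uuid)->Restart());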

http://git-wip-us.apache.org/repos/asf/kudu/blob/b5e990bf/src/kudu/integration-tests/raft_consensus-itest-base.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.h b/src/kudu/integration-tests/raft_consensus-itest-base.h
index 8785e0b..a4266ad 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.h
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.h
@@ -40,6 +40,13 @@ class TabletServerServiceProxy;
 // Uses the whole tablet server stack with ExternalMiniCluster.
 class RaftConsensusITestBase : public TabletServerIntegrationTestBase {
  public:
+  // Behavior for the tablet server hosting the fall-behind-WAL-GC replica.
+  enum BehindWalGcBehavior {
+    STOP_CONTINUE,    // Send SIGSTOP and then SIGCONT to the affected tserver.
+    SHUTDOWN_RESTART, // Shut down and then restart the affected tserver.
+    SHUTDOWN,         // Shut down the affected tserver; don't start it back up.
+  };
+
   RaftConsensusITestBase();
 
   void SetUp() override;
@@ -59,19 +66,25 @@ class RaftConsensusITestBase : public TabletServerIntegrationTestBase {
   // Flags needed for CauseFollowerToFallBehindLogGC() to work well.
   static void AddFlagsForLogRolls(std::vector<std::string>* extra_tserver_flags);
 
-  // Pause one of the followers and write enough data to the remaining replicas
-  // to cause log GC, then resume the paused follower. On success,
-  // 'leader_uuid' will be set to the UUID of the leader, 'orig_term' will be
-  // set to the term of the leader before un-pausing the follower, and
-  // 'fell_behind_uuid' will be set to the UUID of the follower that was paused
-  // and caused to fall behind. These can be used for verification purposes.
+  // Pause or shut down the tserver hosting one of the follower tablet
+  // replicas and write enough data to the remaining replicas to cause log GC.
+  // Then resume/restart the affected tserver, or leave it shut down. On
+  // success, 'leader_uuid' will be set to the UUID of the leader, 'orig_term'
+  // will be set to the term of the leader before un-pausing the follower, and
+  // 'fell_behind_uuid' will be set to the UUID of the follower that was
+  // paused/shut down and caused to fall behind. These can be used for
+  // verification purposes. The optional 'tserver_behavior' dictates whether
+  // the affected tserver will be paused/resumed, shutdown/restarted, or left
+  // shut down.
   //
   // Certain flags should be set. You can add the required flags with
   // AddFlagsForLogRolls() before starting the cluster.
-  void CauseFollowerToFallBehindLogGC(const itest::TabletServerMap& tablet_servers,
-                                      std::string* leader_uuid,
-                                      int64_t* orig_term,
-                                      std::string* fell_behind_uuid);
+  void CauseFollowerToFallBehindLogGC(
+      const itest::TabletServerMap& tablet_servers,
+      std::string* leader_uuid,
+      int64_t* orig_term,
+      std::string* fell_behind_uuid,
+      BehindWalGcBehavior tserver_behavior = BehindWalGcBehavior::STOP_CONTINUE);
 
   CountDownLatch inserters_;
 };
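
Since 'tserver_behavior' defaults to STOP_CONTINUE, existing call sites keep
the old pause/resume behavior without modification, e.g.:

  // Equivalent to the pre-patch four-argument signature.
  NO_FATALS(CauseFollowerToFallBehindLogGC(
      tablet_servers_, &leader_uuid, &orig_term, &fell_behind_uuid));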

http://git-wip-us.apache.org/repos/asf/kudu/blob/b5e990bf/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
index f7d5be9..84a6d94 100644
--- a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
@@ -1895,7 +1895,7 @@ TEST_P(IncompatibleReplicaReplacementSchemesITest, MasterAndTserverMisconfig) {
 
   // Update corresponding flags to induce a misconfiguration between the master
   // and the tablet server.
-  ts->mutable_flags()->push_back(
+  ts->mutable_flags()->emplace_back(
       Substitute("--raft_prepare_replacement_before_eviction=$0", !is_3_4_3));
   ASSERT_OK(ts->Restart());
   if (is_incompatible_replica_management_fatal) {
@@ -1910,7 +1910,7 @@ TEST_P(IncompatibleReplicaReplacementSchemesITest, MasterAndTserverMisconfig) {
   // Inject feature flag not supported by the master and make sure the tablet
   // server will not be registered with incompatible master.
   ts->mutable_flags()->pop_back();
-  ts->mutable_flags()->push_back("--heartbeat_inject_required_feature_flag=999");
+  ts->mutable_flags()->emplace_back("--heartbeat_inject_required_feature_flag=999");
   ts->Shutdown();
   ASSERT_OK(ts->Restart());
   if (is_incompatible_replica_management_fatal) {
@@ -1927,5 +1927,133 @@ TEST_P(IncompatibleReplicaReplacementSchemesITest, MasterAndTserverMisconfig) {
   }
 }
 
+class ReplicaBehindWalGcThresholdITest :
+    public RaftConsensusNonVoterITest,
+    public ::testing::WithParamInterface<RaftConsensusITestBase::BehindWalGcBehavior> {
+};
+INSTANTIATE_TEST_CASE_P(,
+    ReplicaBehindWalGcThresholdITest,
+    ::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
+                      RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
+                      RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN));
+
+// Test that the catalog manager running with the 3-4-3 scheme is able to do
+// 'in-place' replica replacement when a replica falls behind the WAL segment
+// GC threshold, i.e. no additional tablet server is needed for a tablet with
+// replication factor 3 when there are just 3 tablet servers in the cluster.
+TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
+    return;
+  }
+
+  const auto kReplicasNum = 3;
+  const auto kTimeoutSec = 60;
+  const auto kTimeout = MonoDelta::FromSeconds(kTimeoutSec);
+  const auto kUnavailableFailedSec = 5;
+  FLAGS_num_replicas = kReplicasNum;
+  FLAGS_num_tablet_servers = kReplicasNum;
+  const auto tserver_behavior = GetParam();
+
+  vector<string> master_flags = {
+    // This scenario runs with the 3-4-3 replica management scheme.
+    "--raft_prepare_replacement_before_eviction=true",
+  };
+  if (tserver_behavior != BehindWalGcBehavior::SHUTDOWN) {
+    // This scenario verifies that the system evicts the replica that's falling
+    // behind the WAL segment GC threshold. If not shutting down the tablet
+    // server hosting the affected replica, it's necessary to avoid races with
+    // the catalog manager when it replaces the just-evicted replica.
+    master_flags.emplace_back("--master_add_server_when_underreplicated=false");
+  }
+
+  vector<string> tserver_flags = {
+    // This scenario is specific for the 3-4-3 replica management scheme.
+    "--raft_prepare_replacement_before_eviction=true",
+
+    // Detect unavailable replicas faster.
+    Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavaiableFailedSec),
+  };
+  AddFlagsForLogRolls(&tserver_flags); // For CauseFollowerToFallBehindLogGC().
+
+  NO_FATALS(BuildAndStart(tserver_flags, master_flags));
+
+  string follower_uuid;
+  string leader_uuid;
+  int64_t orig_term;
+  NO_FATALS(CauseFollowerToFallBehindLogGC(
+      tablet_servers_, &leader_uuid, &orig_term, &follower_uuid, tserver_behavior));
+
+  // The catalog manager should evict the replica that fell behind the WAL
+  // segment GC threshold right away.
+  bool has_leader = false;
+  TabletLocationsPB tablet_locations;
+  const auto num_replicas = (tserver_behavior == BehindWalGcBehavior::SHUTDOWN)
+      ? kReplicasNum : kReplicasNum - 1;
+  ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
+                                            num_replicas,
+                                            tablet_id_,
+                                            kTimeout,
+                                            WAIT_FOR_LEADER,
+                                            ANY_REPLICA,
+                                            &has_leader,
+                                            &tablet_locations));
+  consensus::ConsensusStatePB cstate;
+  ASSERT_EVENTUALLY([&] {
+    TServerDetails* leader = nullptr;
+    ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
+    // The reason for the enclosing ASSERT_EVENTUALLY is that the leader might
+    // have changed in between these two calls.
+    ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, &cstate));
+  });
+
+  if (tserver_behavior == BehindWalGcBehavior::SHUTDOWN) {
+    // The original voter replica that fell behind the WAL catchup threshold
+    // should be evicted and replaced with a non-voter replica. Since its
+    // tablet server is shutdown, the replica is not able to catch up with
+    // the leader yet (otherwise, it would be promoted to a voter replica).
+    EXPECT_TRUE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
+        << pb_util::SecureDebugString(cstate.committed_config())
+        << "fell behind WAL replica UUID: " << follower_uuid;
+    EXPECT_FALSE(IsRaftConfigVoter(follower_uuid, cstate.committed_config()))
+        << pb_util::SecureDebugString(cstate.committed_config())
+        << "fell behind WAL replica UUID: " << follower_uuid;
+    // Bring the tablet server with the affected replica back.
+    ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->Restart());
+  } else {
+    // The replica that fell behind the WAL catchup threshold should be
+    // evicted, and since the catalog manager is not yet adding replacement
+    // replicas, a replacement replica should not be added yet.
+    EXPECT_FALSE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
+        << pb_util::SecureDebugString(cstate.committed_config())
+        << "fell behind WAL replica UUID: " << follower_uuid;
+    // Restore the default behavior of the catalog manager, so it will
+    // add a replacement replica.
+    for (auto idx = 0; idx < cluster_->num_masters(); ++idx) {
+      auto* master = cluster_->master(idx);
+      master->mutable_flags()->emplace_back(
+          "--master_add_server_when_underreplicated=true");
+      master->Shutdown();
+      ASSERT_OK(master->Restart());
+      ASSERT_OK(master->WaitForCatalogManager());
+    }
+  }
+
+  // The system should be able to recover, replacing the failed replica.
+  ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
+                                            kReplicasNum,
+                                            tablet_id_,
+                                            kTimeout,
+                                            WAIT_FOR_LEADER,
+                                            VOTER_REPLICA,
+                                            &has_leader,
+                                            &tablet_locations));
+  NO_FATALS(cluster_->AssertNoCrashes());
+  ClusterVerifier v(cluster_.get());
+  v.SetOperationsTimeout(kTimeout);
+  v.SetVerificationTimeout(kTimeout);
+  NO_FATALS(v.CheckCluster());
+}
+
 }  // namespace tserver
 }  // namespace kudu
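
To run the new scenario locally (a sketch: the test binary is named after the
source file, and its exact path depends on the build layout):

  KUDU_ALLOW_SLOW_TESTS=1 ./bin/raft_consensus_nonvoter-itest \
      --gtest_filter='*ReplicaBehindWalGcThresholdITest*'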

