kudu-commits mailing list archives

From ale...@apache.org
Subject kudu git commit: [consensus] FAILED_UNRECOVERABLE replica health status
Date Thu, 15 Mar 2018 05:21:16 GMT
Repository: kudu
Updated Branches:
  refs/heads/master e684de337 -> a74f9a0dc


[consensus] FAILED_UNRECOVERABLE replica health status

Added HealthStatus::FAILED_UNRECOVERABLE for a tablet replica. This status
marks replicas that cannot catch up with the leader because the required WAL
segments have been GC-ed, as well as other unrecoverable conditions.

With the introduction of the FAILED_UNRECOVERABLE health status, the
replica management scheme becomes hybrid: the system evicts a replica with
FAILED_UNRECOVERABLE health status before adding a replacement, provided it
anticipates that the configuration-change transaction can be committed.

This patch is part of the fix for KUDU-2342. It also addresses KUDU-2322:
voter replicas that fall behind the log segment GC threshold are now evicted
more aggressively.
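
Below is a minimal, self-contained C++ sketch (not the actual Kudu
implementation) of how a leader-side classifier might map peer conditions onto
the new health statuses described above. The PeerState struct and ClassifyPeer
function are illustrative stand-ins for Kudu's TrackedPeer bookkeeping and
PeerMessageQueue::PeerHealthStatus():

  #include <iostream>

  // Simplified stand-in for HealthReportPB::HealthStatus.
  enum class HealthStatus { UNKNOWN, HEALTHY, FAILED, FAILED_UNRECOVERABLE };

  // Simplified stand-in for the per-peer state tracked by the leader.
  struct PeerState {
    bool wal_catchup_possible = true;  // false once needed WAL segments are GC-ed
    bool tablet_failed = false;        // last exchange reported TABLET_FAILED
    bool last_exchange_ok = false;     // last exchange completed successfully
  };

  // Irrecoverable conditions (WAL fell behind GC, failed tablet) map to
  // FAILED_UNRECOVERABLE so the replica can be evicted before a replacement
  // is added; a successful recent exchange means HEALTHY; otherwise UNKNOWN.
  HealthStatus ClassifyPeer(const PeerState& p) {
    if (!p.wal_catchup_possible || p.tablet_failed) {
      return HealthStatus::FAILED_UNRECOVERABLE;
    }
    return p.last_exchange_ok ? HealthStatus::HEALTHY : HealthStatus::UNKNOWN;
  }

  int main() {
    PeerState lagging;
    lagging.wal_catchup_possible = false;
    std::cout << (ClassifyPeer(lagging) == HealthStatus::FAILED_UNRECOVERABLE)
              << std::endl;  // prints 1: evict before adding a replacement
    return 0;
  }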

Change-Id: I35637c5bda6681b732dbc2bbf94b9d4258b12095
Reviewed-on: http://gerrit.cloudera.org:8080/9625
Tested-by: Alexey Serbin <aserbin@cloudera.com>
Reviewed-by: Mike Percy <mpercy@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a74f9a0d
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a74f9a0d
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a74f9a0d

Branch: refs/heads/master
Commit: a74f9a0dcaf88315c8563b95cdeb5701d9ce5438
Parents: e684de3
Author: Alexey Serbin <aserbin@cloudera.com>
Authored: Tue Mar 13 22:31:32 2018 -0700
Committer: Alexey Serbin <aserbin@cloudera.com>
Committed: Thu Mar 15 05:19:07 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_queue.cc           |  16 +-
 src/kudu/consensus/metadata.proto               |  17 +-
 src/kudu/consensus/quorum_util-test.cc          | 302 ++++++++++++++++---
 src/kudu/consensus/quorum_util.cc               | 237 +++++++++------
 .../raft_consensus_nonvoter-itest.cc            |  41 ++-
 .../ts_tablet_manager-itest.cc                  |   6 +-
 6 files changed, 455 insertions(+), 164 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 28110aa..189f499 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -520,7 +520,8 @@ void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
 
   // Prepare error messages for different conditions.
   string error_msg;
-  if (overall_health_status == HealthReportPB::FAILED) {
+  if (overall_health_status == HealthReportPB::FAILED ||
+      overall_health_status == HealthReportPB::FAILED_UNRECOVERABLE) {
     if (peer->last_exchange_status == PeerStatus::TABLET_FAILED) {
       error_msg = Substitute("The tablet replica hosted on peer $0 has failed", peer->uuid());
     } else if (!peer->wal_catchup_possible) {
@@ -541,7 +542,8 @@ void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
 
   if (FLAGS_raft_prepare_replacement_before_eviction) {
     if (changed) {
-      if (overall_health_status == HealthReportPB::FAILED) {
+      if (overall_health_status == HealthReportPB::FAILED ||
+          overall_health_status == HealthReportPB::FAILED_UNRECOVERABLE) {
         // Only log when the status changes to FAILED.
         LOG_WITH_PREFIX_UNLOCKED(INFO) << error_msg;
       }
@@ -549,7 +551,8 @@ void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
       NotifyObserversOfPeerHealthChange();
     }
   } else {
-    if (overall_health_status == HealthReportPB::FAILED &&
+    if ((overall_health_status == HealthReportPB::FAILED ||
+         overall_health_status == HealthReportPB::FAILED_UNRECOVERABLE) &&
         SafeToEvictUnlocked(peer->uuid())) {
       NotifyObserversOfFailedFollower(peer->uuid(), queue_state_.current_term, error_msg);
     }
@@ -563,14 +566,15 @@ HealthReportPB::HealthStatus PeerMessageQueue::PeerHealthStatus(const TrackedPee
     return HealthReportPB::FAILED;
   }
 
-  // Replicas that have fallen behind the leader's retained WAL are considered failed.
+  // Replicas that have fallen behind the leader's retained WAL are failed
+  // and have no chance to come back.
   if (!peer.wal_catchup_possible) {
-    return HealthReportPB::FAILED;
+    return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
   // Replicas returning TABLET_FAILED status are considered failed.
   if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
-    return HealthReportPB::FAILED;
+    return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
   // The happy case: replicas returned OK during the recent exchange are considered healthy.

http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/consensus/metadata.proto
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/metadata.proto b/src/kudu/consensus/metadata.proto
index a310127..6d21a41 100644
--- a/src/kudu/consensus/metadata.proto
+++ b/src/kudu/consensus/metadata.proto
@@ -40,9 +40,20 @@ message RaftPeerAttrsPB {
 // Report on a replica's (peer's) health.
 message HealthReportPB {
   enum HealthStatus {
-    UNKNOWN = 999;  // No information on the health status.
-    FAILED = 0;     // Replica has failed and needs replacement.
-    HEALTHY = 1;    // Replica is functioning properly.
+    // No information on the health status.
+    UNKNOWN = 999;
+
+    // Replica has failed and needs replacement. The failure might be a
+    // transient one, so replica may return to a healthy state soon.
+    FAILED = 0;
+
+    // Replica is functioning properly.
+    HEALTHY = 1;
+
+    // Replica has failed in an irreversible and unrecoverable way and needs
+    // replacement. The failure is permanent and the replica definitely cannot
+    // return to a healthy state.
+    FAILED_UNRECOVERABLE = 2;
   }
 
   // Overall health status of a replica. Reflects at least the responsiveness

http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/consensus/quorum_util-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc
index 9dc4d7a..f98e819 100644
--- a/src/kudu/consensus/quorum_util-test.cc
+++ b/src/kudu/consensus/quorum_util-test.cc
@@ -48,7 +48,7 @@ constexpr auto MHP_H = MajorityHealthPolicy::HONOR; // NOLINT(readability-identi
 constexpr auto MHP_I = MajorityHealthPolicy::IGNORE;// NOLINT(readability-identifier-naming)
 
 // The various possible health statuses.
-constexpr auto kHealthStatuses = { '?', '-', '+' };
+constexpr auto kHealthStatuses = { '?', '-', 'x', '+' };
 
 typedef std::pair<string, bool> Attr;
 
@@ -61,6 +61,9 @@ static void SetOverallHealth(HealthReportPB* health_report,
     case '-':
       health_report->set_overall_health(HealthReportPB::FAILED);
       break;
+    case 'x':
+      health_report->set_overall_health(HealthReportPB::FAILED_UNRECOVERABLE);
+      break;
     case '?':
       health_report->set_overall_health(HealthReportPB::UNKNOWN);
       break;
@@ -362,6 +365,24 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 4, policy));
     }
   }
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, health_status);
+    AddPeer(&config, "B", V, health_status);
+    AddPeer(&config, "C", V, health_status);
+    if (policy == MHP_H) {
+      // The configuration is under-replicated, but there are not enough healthy
+      // voters to commit the configuration change.
+      EXPECT_FALSE(ShouldAddReplica(config, 4, policy));
+      EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
+      EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
+    } else {
+      EXPECT_TRUE(ShouldAddReplica(config, 4, policy));
+      EXPECT_TRUE(ShouldAddReplica(config, 2, policy));
+      EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
+    }
+  }
   {
     RaftConfigPB config;
     AddPeer(&config, "A", V, '?');
@@ -399,10 +420,11 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     }
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '?');
-    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "B", V, health_status);
     AddPeer(&config, "C", N, '+');
     // The configuration is over-replicated already.
     EXPECT_FALSE(ShouldAddReplica(config, 1, policy));
@@ -415,10 +437,11 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     }
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '?');
-    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "B", V, health_status);
     AddPeer(&config, "C", N, '+', {{"PROMOTE", true}});
     EXPECT_FALSE(ShouldAddReplica(config, 1, policy));
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
@@ -430,11 +453,12 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     }
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '?');
-    AddPeer(&config, "B", V, '-');
-    AddPeer(&config, "C", N, '-', {{"PROMOTE", true}});
+    AddPeer(&config, "B", V, health_status);
+    AddPeer(&config, "C", N, health_status, {{"PROMOTE", true}});
     EXPECT_FALSE(ShouldAddReplica(config, 1, policy));
     if (policy == MHP_H) {
       EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
@@ -444,11 +468,12 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     }
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "C", V, health_status);
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
@@ -490,20 +515,22 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
     EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
-    AddPeer(&config, "D", N, '-');
+    AddPeer(&config, "C", V, health_status);
+    AddPeer(&config, "D", N, health_status);
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "C", V, health_status);
     AddPeer(&config, "D", N, '+');
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
     // The non-voter replica does not have the PROMOTE attribute,
@@ -511,48 +538,53 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 4, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "C", V, health_status);
     AddPeer(&config, "D", N, '+', {{"PROMOTE", true}});
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
     EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 4, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
-    AddPeer(&config, "D", N, '-', {{"PROMOTE", true}});
+    AddPeer(&config, "C", V, health_status);
+    AddPeer(&config, "D", N, health_status, {{"PROMOTE", true}});
     EXPECT_FALSE(ShouldAddReplica(config, 2, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
-    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "A", V, health_status);
     AddPeer(&config, "B", V, '+');
     AddPeer(&config, "C", V, '+');
-    AddPeer(&config, "D", N, '-', {{"PROMOTE", true}});
+    AddPeer(&config, "D", N, health_status, {{"PROMOTE", true}});
     AddPeer(&config, "E", N, '+', {{"PROMOTE", true}});
     EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
-    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "A", V, health_status);
     AddPeer(&config, "B", V, '+');
     AddPeer(&config, "C", V, '+');
-    AddPeer(&config, "D", N, '-', {{"PROMOTE", true}});
+    AddPeer(&config, "D", N, health_status, {{"PROMOTE", true}});
     AddPeer(&config, "E", N, '+', {{"PROMOTE", false}});
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
-    AddPeer(&config, "B", V, '-');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "B", V, health_status);
+    AddPeer(&config, "C", V, health_status);
     if (policy == MHP_H) {
       // If honoring the health of the replica's majority, the catalog manager
       // will not add a new non-voter replica until the situation is resolved.
@@ -561,12 +593,13 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldAddReplica) {
       EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
     }
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
-    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "B", V, health_status);
     AddPeer(&config, "C", V, '+');
-    AddPeer(&config, "D", V, '-');
+    AddPeer(&config, "D", V, health_status);
     AddPeer(&config, "E", V, '+');
     EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 4, policy));
@@ -640,19 +673,26 @@ TEST(QuorumUtilTest, ShouldEvictReplicaVoters) {
     //   * failed & slated for replacement
     //   * failed
     //   * ...
-    if (health_status == '-') {
+    if (health_status == '-' || health_status == 'x') {
       EXPECT_EQ("D", to_evict);
     } else {
       EXPECT_EQ("C", to_evict);
     }
     ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, MHP_H, &to_evict));
-    if (health_status == '-') {
+    if (health_status == '-' || health_status == 'x') {
       EXPECT_EQ("D", to_evict);
     } else {
       EXPECT_EQ("C", to_evict);
     }
-    // Since we are not over-replicated, we will not evict in this case.
-    EXPECT_FALSE(ShouldEvictReplica(config, "A", 4, MHP_H));
+    if (health_status == 'x') {
+      // An unrecoverably failed replica should be evicted even if the configuration
+      // is not over-replicated, provided it's safe to commit the configuration change.
+      ASSERT_TRUE(ShouldEvictReplica(config, "A", 4, MHP_H));
+      EXPECT_EQ("D", to_evict);
+    } else {
+      // Since we are not over-replicated, we will not evict in this case.
+      EXPECT_FALSE(ShouldEvictReplica(config, "A", 4, MHP_H));
+    }
   }
   {
     RaftConfigPB config;
@@ -684,6 +724,36 @@ TEST(QuorumUtilTest, ShouldEvictReplicaVoters) {
     ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, MHP_H, &to_evict));
     EXPECT_EQ("B", to_evict);
   }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '+');
+    AddPeer(&config, "B", V, 'x');
+    AddPeer(&config, "C", V, '+');
+    string to_evict;
+    ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, MHP_H, &to_evict));
+    EXPECT_EQ("B", to_evict);
+    ASSERT_TRUE(ShouldEvictReplica(config, "A", 2, MHP_H, &to_evict));
+    EXPECT_EQ("B", to_evict);
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '+');
+    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "C", V, 'x');
+
+    // No majority to commit the change.
+    EXPECT_FALSE(ShouldEvictReplica(config, "A", 3, MHP_H));
+    EXPECT_FALSE(ShouldEvictReplica(config, "A", 2, MHP_H));
+
+    // If ignoring the safety rules, it tries to evict even if the majority
+    // of replicas are not online. Among failed replicas, irreversibly failed
+    // ones are evicted first.
+    string to_evict;
+    ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, MHP_I, &to_evict));
+    EXPECT_EQ("C", to_evict);
+    ASSERT_TRUE(ShouldEvictReplica(config, "A", 2, MHP_I, &to_evict));
+    EXPECT_EQ("C", to_evict);
+  }
 }
 
 // Verify logic of the kudu::consensus::ShouldEvictReplica(), anticipating
@@ -698,25 +768,32 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldEvictReplicaVoters) {
     EXPECT_FALSE(ShouldEvictReplica(config, "", 2, policy));
     EXPECT_FALSE(ShouldEvictReplica(config, "", 3, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '?');
     AddPeer(&config, "B", V, '?');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "C", V, health_status);
     EXPECT_FALSE(ShouldEvictReplica(config, "", 3, policy));
     EXPECT_FALSE(ShouldEvictReplica(config, "", 2, policy));
   }
-  {
+  for (auto health_status : { '-', 'x' }) {
+    SCOPED_TRACE(Substitute("health status '$0'", health_status));
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+');
     AddPeer(&config, "B", V, '+');
-    AddPeer(&config, "C", V, '-');
+    AddPeer(&config, "C", V, health_status);
     string to_evict;
     ASSERT_TRUE(ShouldEvictReplica(config, "A", 1, policy, &to_evict));
     EXPECT_EQ("C", to_evict);
     ASSERT_TRUE(ShouldEvictReplica(config, "A", 2, policy));
     EXPECT_EQ("C", to_evict);
-    EXPECT_FALSE(ShouldEvictReplica(config, "A", 3, policy));
+    if (health_status == '-') {
+      EXPECT_FALSE(ShouldEvictReplica(config, "A", 3, policy));
+    } else {
+      ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, policy, &to_evict));
+      EXPECT_EQ("C", to_evict);
+    }
   }
   {
     RaftConfigPB config;
@@ -725,6 +802,19 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldEvictReplicaVoters) {
     AddPeer(&config, "C", V, '+');
     EXPECT_FALSE(ShouldEvictReplica(config, "C", 3, policy));
   }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '+');
+    AddPeer(&config, "B", V, '?');
+    AddPeer(&config, "C", V, 'x');
+    if (policy == MHP_H) {
+      EXPECT_FALSE(ShouldEvictReplica(config, "A", 3, policy));
+    } else {
+      string to_evict;
+      ASSERT_TRUE(ShouldEvictReplica(config, "A", 3, policy, &to_evict));
+      EXPECT_EQ("C", to_evict);
+    }
+  }
 }
 
 // Verify logic of the kudu::consensus::ShouldEvictReplica(), anticipating
@@ -871,6 +961,42 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ShouldEvictReplicaNonVoters) {
     EXPECT_FALSE(ShouldEvictReplica(config, "B", 3, policy));
     EXPECT_FALSE(ShouldEvictReplica(config, "B", 4, policy));
   }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    AddPeer(&config, "D", N, 'x', {{"PROMOTE", true}});
+    string to_evict;
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 3, policy, &to_evict));
+    EXPECT_EQ("D", to_evict);
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 4, policy, &to_evict));
+    EXPECT_EQ("D", to_evict);
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, 'x');
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    AddPeer(&config, "D", N, 'x');
+    string to_evict;
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 3, policy, &to_evict));
+    EXPECT_EQ("D", to_evict);
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 4, policy, &to_evict));
+    EXPECT_EQ("D", to_evict);
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, 'x');
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    AddPeer(&config, "D", N, '-');
+    string to_evict;
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 3, policy, &to_evict));
+    EXPECT_EQ("A", to_evict);
+    ASSERT_TRUE(ShouldEvictReplica(config, "B", 4, policy, &to_evict));
+    EXPECT_EQ("A", to_evict);
+  }
 }
 
 TEST_P(QuorumUtilHealthPolicyParamTest, DontEvictLeader) {
@@ -945,7 +1071,7 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ReplaceAttributeBasic) {
     EXPECT_FALSE(ShouldEvictReplica(config, "A", 3, policy));
     EXPECT_FALSE(ShouldAddReplica(config, 3, policy));
   }
-  for (auto health_status : { '-', '?' }) {
+  for (auto health_status : { '-', '?', 'x' }) {
     RaftConfigPB config;
     AddPeer(&config, "A", V, health_status, {{"REPLACE", true}});
     AddPeer(&config, "B", V, '+');
@@ -995,7 +1121,7 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ReplaceAttributeBasic) {
     }
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
-  for (auto health_status : { '?', '-' }) {
+  for (auto health_status : { '?', '-', 'x' }) {
     RaftConfigPB config;
     AddPeer(&config, "A", V, health_status, {{"REPLACE", true}});
     AddPeer(&config, "B", V, '+', {{"REPLACE", true}});
@@ -1030,7 +1156,7 @@ TEST_P(QuorumUtilHealthPolicyParamTest, ReplaceAttributeBasic) {
     EXPECT_FALSE(ShouldEvictReplica(config, "B", 3, policy));
     EXPECT_TRUE(ShouldAddReplica(config, 3, policy));
   }
-  for (auto health_status : { '?', '-' }) {
+  for (auto health_status : { '?', '-', 'x' }) {
     RaftConfigPB config;
     AddPeer(&config, "A", V, '+', {{"REPLACE", true}});
     AddPeer(&config, "B", V, '+', {{"REPLACE", true}});
@@ -1309,6 +1435,94 @@ TEST(QuorumUtilTest, ShouldEvictReplicaNonVoters) {
   EXPECT_EQ("C", to_evict);
 }
 
+// A scenario of replica replacement where replicas fall behind the log segment
+// GC threshold and are replaced accordingly. This scenario is written to
+// address cases like KUDU-2342.
+TEST(QuorumUtilTest, NewlyAddedNonVoterFallsBehindLogGC) {
+  constexpr auto kReplicationFactor = 3;
+  constexpr auto kPolicy = MajorityHealthPolicy::HONOR;
+
+  RaftConfigPB config;
+  AddPeer(&config, "A", V, '+');
+  AddPeer(&config, "B", V, '+');
+  AddPeer(&config, "C", V, '+');
+
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // Replica B falls behind the log segment GC threshold. Since this is an
+  // irreversible failure, the system tries to evict the replica right away.
+  SetPeerHealth(&config, "B", 'x');
+  string to_evict;
+  ASSERT_TRUE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy, &to_evict));
+  EXPECT_EQ("B", to_evict);
+
+  RemovePeer(&config, to_evict);
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_TRUE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // Adding a non-voter to replace B.
+  AddPeer(&config, "D", N, '?', {{"PROMOTE", true}});
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The new non-voter replica becomes healthy.
+  SetPeerHealth(&config, "D", '+');
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The new non-voter replica falls behind the log segment GC threshold. The
+  // system should evict it before trying to add a replacement replica.
+  SetPeerHealth(&config, "D", 'x');
+  ASSERT_TRUE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy, &to_evict));
+  EXPECT_EQ("D", to_evict);
+  RemovePeer(&config, to_evict);
+
+  // A new non-voter replica is needed.
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_TRUE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // Adding a non-voter to replace D.
+  AddPeer(&config, "E", N, '?', {{"PROMOTE", true}});
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The new non-voter replica 'E' becomes healthy.
+  SetPeerHealth(&config, "E", '+');
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The newly added replica gets promoted to voter.
+  PromotePeer(&config, "E");
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The new voter replica E falls behind the log segment GC threshold. The
+  // replica should be evicted.
+  SetPeerHealth(&config, "E", 'x');
+  ASSERT_TRUE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy, &to_evict));
+  EXPECT_EQ("E", to_evict);
+
+  RemovePeer(&config, to_evict);
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_TRUE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The system should add a replacement for the evicted replica.
+  AddPeer(&config, "F", N, '?', {{"PROMOTE", true}});
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The new non-voter replica 'F' becomes healthy.
+  SetPeerHealth(&config, "F", '+');
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+
+  // The newly added replica 'F' gets promoted to voter, all is well now.
+  PromotePeer(&config, "F");
+  EXPECT_FALSE(ShouldEvictReplica(config, "A", kReplicationFactor, kPolicy));
+  EXPECT_FALSE(ShouldAddReplica(config, kReplicationFactor, kPolicy));
+}
+
 // A scenario of replica replacement where the replica added for replacement
 // of a failed one also fails. The system should end up replacing both failed
 // replicas.

http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/consensus/quorum_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
index 97c006b..1c0fc71 100644
--- a/src/kudu/consensus/quorum_util.cc
+++ b/src/kudu/consensus/quorum_util.cc
@@ -17,7 +17,9 @@
 #include "kudu/consensus/quorum_util.h"
 
 #include <map>
+#include <memory>
 #include <ostream>
+#include <queue>
 #include <set>
 #include <string>
 #include <utility>
@@ -26,6 +28,7 @@
 #include <glog/logging.h>
 
 #include "kudu/common/common.pb.h"
+#include "kudu/gutil/macros.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/join.h"
@@ -38,6 +41,7 @@ using kudu::pb_util::SecureShortDebugString;
 using kudu::pb_util::SecureDebugString;
 using std::map;
 using std::pair;
+using std::priority_queue;
 using std::set;
 using std::string;
 using std::vector;
@@ -422,20 +426,30 @@ bool ShouldAddReplica(const RaftConfigPB& config,
   // the appropriate default values of those fields.
   VLOG(2) << "config to evaluate: " << SecureDebugString(config);
   for (const RaftPeerPB& peer : config.peers()) {
+    const auto overall_health = peer.health_report().overall_health();
     switch (peer.member_type()) {
       case RaftPeerPB::VOTER:
         ++num_voters_total;
         if (peer.attrs().replace() ||
-            peer.health_report().overall_health() == HealthReportPB::FAILED) {
+            overall_health == HealthReportPB::FAILED ||
+            overall_health == HealthReportPB::FAILED_UNRECOVERABLE) {
           ++num_voters_need_replacement;
         }
-        if (peer.health_report().overall_health() == HealthReportPB::HEALTHY) {
+        if (overall_health == HealthReportPB::HEALTHY) {
           ++num_voters_healthy;
         }
         break;
       case RaftPeerPB::NON_VOTER:
         if (peer.attrs().promote() &&
-            peer.health_report().overall_health() != HealthReportPB::FAILED) {
+            overall_health != HealthReportPB::FAILED &&
+            overall_health != HealthReportPB::FAILED_UNRECOVERABLE) {
+          // A replica with HEALTHY or UNKNOWN overall health status
+          // is considered a candidate for promotion: a new non-voter
+          // replica is added with UNKNOWN health status. If such a
+          // replica is not responsive for a long time, its status
+          // eventually changes to HealthReportPB::FAILED and it will be
+          // evicted. Until then, it remains a candidate for promotion
+          // in the code below.
           ++num_non_voters_to_promote;
         }
         break;
@@ -469,8 +483,61 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
                         int replication_factor,
                         MajorityHealthPolicy policy,
                         string* uuid_to_evict) {
-  // If there is no leader, we can't evict anybody.
-  if (leader_uuid.empty()) return false;
+  if (leader_uuid.empty()) {
+    // If there is no leader, we can't evict anybody.
+    return false;
+  }
+
+  typedef pair<string, int> Elem;
+  static const auto kCmp = [](const Elem& lhs, const Elem& rhs) {
+    // Elements of higher priority should rise to the top of the queue.
+    return lhs.second < rhs.second;
+  };
+  typedef priority_queue<Elem, vector<Elem>, decltype(kCmp)> PeerPriorityQueue;
+
+  PeerPriorityQueue pq_non_voters(kCmp);
+  PeerPriorityQueue pq_voters(kCmp);
+
+  const auto peer_to_elem = [](const RaftPeerPB& peer) {
+    const string& peer_uuid = peer.permanent_uuid();
+    const auto overall_health = peer.health_report().overall_health();
+
+    // Non-voter candidates for eviction (in decreasing priority):
+    //   * failed unrecoverably
+    //   * failed
+    //   * in unknown health state
+    //   * any other
+    //
+    // Voter candidates for eviction (in decreasing priority):
+    //   * failed unrecoverably and having the attribute REPLACE set
+    //   * failed unrecoverably
+    //   * failed and having the attribute REPLACE set
+    //   * failed
+    //   * having the attribute REPLACE set
+    //   * in unknown health state
+    //   * any other
+
+    int priority = 0;
+    switch (overall_health) {
+      case HealthReportPB::FAILED_UNRECOVERABLE:
+        priority = 8;
+        break;
+      case HealthReportPB::FAILED:
+        priority = 4;
+        break;
+      case HealthReportPB::HEALTHY:
+        priority = 0;
+        break;
+      case HealthReportPB::UNKNOWN:   FALLTHROUGH_INTENDED;
+      default:
+        priority = 1;
+        break;
+    }
+    if (peer.member_type() == RaftPeerPB::VOTER && peer.attrs().replace()) {
+      priority += 2;
+    }
+    return Elem(peer_uuid, priority);
+  };
 
   int num_non_voters_total = 0;
 
@@ -479,17 +546,14 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   int num_voters_with_replace = 0;
   int num_voters_viable = 0;
 
-  string non_voter_any;
-  string non_voter_failed;
-  string non_voter_unknown_health;
-
-  string voter_any;
-  string voter_failed;
-  string voter_replace;
-  string voter_unknown_health;
-
   bool leader_with_replace = false;
 
+  bool has_non_voter_failed = false;
+  bool has_non_voter_failed_unrecoverable = false;
+  bool has_voter_failed = false;
+  bool has_voter_failed_unrecoverable = false;
+  bool has_voter_unknown_health = false;
+
   // While working with the optional fields related to per-replica health status
   // and attributes, has_a_field()-like methods are not called because of
   // the appropriate default values of those fields.
@@ -497,11 +561,13 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   for (const RaftPeerPB& peer : config.peers()) {
     DCHECK(peer.has_permanent_uuid() && !peer.permanent_uuid().empty());
     const string& peer_uuid = peer.permanent_uuid();
-    const bool failed = peer.health_report().overall_health() == HealthReportPB::FAILED;
-    const bool healthy = peer.health_report().overall_health() == HealthReportPB::HEALTHY;
+    const auto overall_health = peer.health_report().overall_health();
+    const bool failed = overall_health == HealthReportPB::FAILED;
+    const bool failed_unrecoverable = overall_health == HealthReportPB::FAILED_UNRECOVERABLE;
+    const bool healthy = overall_health == HealthReportPB::HEALTHY;
     const bool unknown = !peer.has_health_report() ||
         !peer.health_report().has_overall_health() ||
-        peer.health_report().overall_health() == HealthReportPB::UNKNOWN;
+        overall_health == HealthReportPB::UNKNOWN;
     const bool has_replace = peer.attrs().replace();
 
     switch (peer.member_type()) {
@@ -538,36 +604,20 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
           // replica is not to be evicted.
           break;
         }
-        if (failed && has_replace) {
-          voter_failed = voter_replace = peer_uuid;
-        }
-        if (failed && voter_failed.empty()) {
-          voter_failed = peer_uuid;
-        }
-        if (unknown && has_replace &&
-            (voter_replace.empty() || voter_replace != voter_failed)) {
-          voter_replace = voter_unknown_health = peer_uuid;
-        }
-        if (has_replace && voter_replace.empty()) {
-          voter_replace = peer_uuid;
-        }
-        if (unknown && voter_unknown_health.empty()) {
-          voter_unknown_health = peer_uuid;
-        }
-        voter_any = peer_uuid;
+
+        pq_voters.emplace(peer_to_elem(peer));
+        has_voter_failed |= failed;
+        has_voter_failed_unrecoverable |= failed_unrecoverable;
+        has_voter_unknown_health |= unknown;
         break;
 
       case RaftPeerPB::NON_VOTER:
-        ++num_non_voters_total;
         DCHECK_NE(peer_uuid, leader_uuid) << peer_uuid
             << ": non-voter as a leader; " << SecureShortDebugString(config);
-        if (failed && non_voter_failed.empty()) {
-          non_voter_failed = peer_uuid;
-        }
-        if (unknown && non_voter_unknown_health.empty()) {
-          non_voter_unknown_health = peer_uuid;
-        }
-        non_voter_any = peer_uuid;
+        pq_non_voters.emplace(peer_to_elem(peer));
+        ++num_non_voters_total;
+        has_non_voter_failed |= failed;
+        has_non_voter_failed_unrecoverable |= failed_unrecoverable;
         break;
 
       default:
@@ -576,12 +626,9 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
     }
   }
 
-  // A few sanity checks: the leader replica UUID should not be among those
-  // to evict.
-  DCHECK_NE(leader_uuid, voter_any);
-  DCHECK_NE(leader_uuid, voter_failed);
-  DCHECK_NE(leader_uuid, voter_replace);
-  DCHECK_NE(leader_uuid, voter_unknown_health);
+  // Sanity check: the leader replica UUID should not be among those to evict.
+  DCHECK(pq_voters.empty() || pq_voters.top().first != leader_uuid);
+  DCHECK(pq_non_voters.empty() || pq_non_voters.top().first != leader_uuid);
 
   // A conservative approach is used when evicting replicas. In short, the
   // removal of replicas from the tablet without exact knowledge of their health
@@ -604,18 +651,18 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   //   the required replication factor, while a present non-voter replica could
   //   be a good fit to replace a voter replica, if needed.
   //
-  // * A non-voter replica with FAILED health status may be evicted if the
-  //   number of voter replicas in good health without the 'replace' attribute
-  //   is greater than or equal to a strict majority of voter replicas. The idea
-  //   is to avoid polluting available tablet servers with failed non-voter
-  //   replicas, while replacing failed non-voters with healthy non-voters as
-  //   aggressively as possible. Also, we want to be sure that an eviction can
-  //   succeed before initiating it.
+  // * A non-voter replica with FAILED or FAILED_UNRECOVERABLE health status
+  //   may be evicted if the number of voter replicas in good health without
+  //   the 'replace' attribute is greater than or equal to a strict majority
+  //   of voter replicas. The idea is to avoid polluting available tablet
+  //   servers with failed non-voter replicas, while replacing failed non-voters
+  //   with healthy non-voters as aggressively as possible. Also, we want to be
+  //   sure that an eviction can succeed before initiating it.
   //
   // * A voter replica may be evicted regardless of its health status
   //   if after the eviction the number of voter replicas in good health will be
   //   greater than or equal to the required replication factor and the leader
-  //   replica itself is not marked with the 'replace' attribute set. The latter
+  //   replica itself is not marked with the 'replace' attribute. The latter
   //   part of the condition emerges from the following observations:
   //     ** By definition, a voter replica marked with the 'replace' attribute
   //        should be eventually evicted from the Raft group.
@@ -633,6 +680,10 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   //   'replace' attribute is greater than or equal to a strict majority of
   //   voter replicas.
   //
+  // * A voter replica with FAILED_UNRECOVERABLE health may be evicted when
+  //   the number of *other* voter replicas in good health without the 'replace'
+  //   attribute is greater than or equal to a strict majority of voter replicas.
+  //
   // * A voter replica in good health marked with the 'replace' attribute may be
   //   evicted when the number of replicas in good health after the eviction
   //   is greater than or equal to the required replication factor.
@@ -649,7 +700,9 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   // This is to avoid polluting tablet servers with failed replicas. Otherwise,
   // it may be a situation when it's impossible to add a new non-voter replica
   // to replace failed ones.
-  need_to_evict_non_voter |= !non_voter_failed.empty();
+  need_to_evict_non_voter |=
+      has_non_voter_failed ||
+      has_non_voter_failed_unrecoverable;
 
   // All the non-voter-related sub-cases are applicable only when there is at
   // least one non-voter replica and a majority of voter replicas are on-line
@@ -668,13 +721,13 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
   // This is to avoid polluting tablet servers with failed replicas. Otherwise,
   // it may be a situation when it's impossible to add a new non-voter replica
   // to replace failed ones.
-  need_to_evict_voter |= !voter_failed.empty();
+  need_to_evict_voter |= (has_voter_failed || has_voter_failed_unrecoverable);
 
   // In case if we already have enough healthy replicas running, it's safe to
   // get rid of replicas with unknown health state.
   need_to_evict_voter |=
       num_voters_viable >= replication_factor &&
-      !voter_unknown_health.empty();
+      has_voter_unknown_health;
 
   // Working with the replicas marked with the 'replace' attribute:
   // the case when too many replicas are marked with the 'replace' attribute
@@ -694,50 +747,46 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
       (num_voters_with_replace > 0 && num_voters_healthy > replication_factor);
 
   // The voter-related sub-cases are applicable only when the total number of
-  // voter replicas is greater than the target replication factor and a majority
-  // of voter replicas are on-line to commit the Raft configuration change.
+  // voter replicas is greater than the target replication factor or there is
+  // a non-recoverably failed voter; meanwhile, a majority of voter replicas
+  // should be on-line to commit the Raft configuration change.
   const bool should_evict_voter = need_to_evict_voter &&
-      num_voters_total > replication_factor &&
+      (num_voters_total > replication_factor ||
+       has_voter_failed_unrecoverable) &&
       (num_voters_healthy >= MajoritySize(num_voters_total - 1) ||
        policy == MajorityHealthPolicy::IGNORE);
 
   const bool should_evict = should_evict_non_voter || should_evict_voter;
+  // When voters and non-voters exhibit the same type of failure, we evict
+  // non-voters first; but if there is an irreversibly failed voter and no
+  // irreversibly failed non-voters, then we evict such a voter first.
+  // That's because a transiently failed non-voter might come back in good
+  // shape in a few moments. Also, getting rid of an irreversibly failed voter
+  // may be beneficial in even-number-of-voters configurations: the majority
+  // gets more chances to stay actionable if another replica fails.
+  //
+  // So, the eviction priority order is:
+  //   (1) unrecoverable non_voters
+  //   (2) unrecoverable voters
+  //   (3) evictable non_voters
+  //   (4) evictable voters
   string to_evict;
-  if (should_evict_non_voter) {
-    // First try to evict an excess non-voter, if present. This might help to
-    // avoid IO load due to a tablet copy in progress.
-    //
-    // Non-voter candidates for eviction (in decreasing priority):
-    //   * failed
-    //   * in unknown health state
-    //   * any other
-    if (!non_voter_failed.empty()) {
-      to_evict = non_voter_failed;
-    } else if (!non_voter_unknown_health.empty()) {
-      to_evict = non_voter_unknown_health;
-    } else {
-      to_evict = non_voter_any;
-    }
+  if (should_evict_non_voter && has_non_voter_failed_unrecoverable) {
+    CHECK(!pq_non_voters.empty());
+    to_evict = pq_non_voters.top().first;
+  } else if (should_evict_voter && has_voter_failed_unrecoverable) {
+    CHECK(!pq_voters.empty());
+    to_evict = pq_voters.top().first;
+  } else if (should_evict_non_voter) {
+    CHECK(!pq_non_voters.empty());
+    to_evict = pq_non_voters.top().first;
   } else if (should_evict_voter) {
-    // Next try to evict an excess voter.
-    //
-    // Voter candidates for eviction (in decreasing priority):
-    //   * failed and having the attribute REPLACE set
-    //   * failed
-    //   * having the attribute REPLACE set
-    //   * in unknown health state
-    //   * any other
-    if (!voter_failed.empty()) {
-      to_evict = voter_failed;
-    } else if (!voter_replace.empty()) {
-      to_evict = voter_replace;
-    } else if (!voter_unknown_health.empty()) {
-      to_evict = voter_unknown_health;
-    } else {
-      to_evict = voter_any;
-    }
+    CHECK(!pq_voters.empty());
+    to_evict = pq_voters.top().first;
   }
 
+  DCHECK((!should_evict && to_evict.empty()) ||
+         (should_evict && !to_evict.empty()));
   if (should_evict) {
     DCHECK(!to_evict.empty());
     DCHECK_NE(leader_uuid, to_evict);

http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
index 8e1d08c..f7d5be9 100644
--- a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
@@ -1395,13 +1395,13 @@ TEST_F(RaftConsensusNonVoterITest, TabletServerIsGoneAndBack) {
 }
 
// A two-step scenario: first, an existing tablet replica fails because it
-// fails behind the threshold of the GCed WAL segment threshold. The catalog
-// manager should notice that and add a new non-voter replica in attempt to
-// replace the failed replica. Then, the newly added non-voter replica becomes
-// unavailable before completing the tablet copy phase. The catalog manager
-// should add a new non-voter replica to make it possible to replace the failed
-// voter replica, so eventually the tablet has appropriate number of functional
-// replicas to guarantee the tablet's replication factor.
+// falls behind the WAL segment GC threshold. The catalog manager should
+// notice that and evict it right away. Then it should add a new non-voter
+// replica in an attempt to replace the evicted one. The newly added non-voter
+// replica becomes unavailable before completing the tablet copy phase.
+// The catalog manager should add a new non-voter replica to make it possible
+// to replace the failed voter replica, so eventually the tablet has the right
+// number of functional replicas to guarantee the tablet's replication factor.
 TEST_F(RaftConsensusNonVoterITest, FailedTabletCopy) {
   if (!AllowSlowTests()) {
     LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
@@ -1470,22 +1470,32 @@ TEST_F(RaftConsensusNonVoterITest, FailedTabletCopy) {
     int64_t orig_term;
     NO_FATALS(CauseFollowerToFallBehindLogGC(
         replica_servers, &leader_uuid, &orig_term, &follower_uuid));
+
+    // Make sure this tablet server will not host a functional tablet replica,
+    // even if the system tries to place one there after evicting the current one.
+    ExternalTabletServer* ts =
+        cluster_->tablet_server_by_uuid(follower_uuid);
+    ASSERT_NE(nullptr, ts);
+    ASSERT_OK(cluster_->SetFlag(ts,
+                                "tablet_copy_fault_crash_on_fetch_all", "1.0"));
   }
 
   // The leader replica marks the non-responsive replica as failed after it
   // realizes the replica would not be able to catch up, and the catalog
-  // manager should add a new non-voter as a replacement.
+  // manager evicts the failed replica right away since it failed in an
+  // unrecoverable way.
   bool has_leader = false;
   TabletLocationsPB tablet_locations;
   ASSERT_OK(WaitForReplicasReportedToMaster(cluster_->master_proxy(),
-                                            kReplicasNum + 1,
+                                            kReplicasNum - 1,
                                             tablet_id_,
                                             kTimeout,
                                             WAIT_FOR_LEADER,
-                                            ANY_REPLICA,
+                                            VOTER_REPLICA,
                                             &has_leader,
                                             &tablet_locations));
 
+  // The system should add a new non-voter as a replacement.
   // However, the tablet server with the new non-voter replica crashes during
   // the tablet copy phase. Give the catalog manager some time to detect that
   // and purge the failed non-voter from the configuration. Also, the TS manager
@@ -1507,13 +1517,14 @@ TEST_F(RaftConsensusNonVoterITest, FailedTabletCopy) {
     // have changed in between of these two calls.
     ASSERT_OK(GetConsensusState(leader, tablet_id_, kTimeout, &cstate));
   });
-  // The original voter replica that fell behind WAL catchup threshold should
-  // still be there.
-  EXPECT_TRUE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
+  // The original voter replica that fell behind the WAL catchup threshold
+  // should not be there: it should have been evicted.
+  EXPECT_FALSE(IsRaftConfigMember(follower_uuid, cstate.committed_config()))
       << pb_util::SecureDebugString(cstate.committed_config())
       << "fell behind WAL replica UUID: " << follower_uuid;
-  // The first non-voter replica failed on tablet copy should be evicted.
-  EXPECT_FALSE(IsRaftConfigMember(ts0->uuid(), cstate.committed_config()))
+  // The first non-voter replica, which failed during tablet copy, cannot be
+  // evicted because no replacement replica is available yet at this point.
+  EXPECT_TRUE(IsRaftConfigMember(ts0->uuid(), cstate.committed_config()))
       << pb_util::SecureDebugString(cstate.committed_config())
       << "failed tablet copy replica UUID: " << ts0->uuid();
   // No replicas from the second tablet server should be in the config yet.

http://git-wip-us.apache.org/repos/asf/kudu/blob/a74f9a0d/src/kudu/integration-tests/ts_tablet_manager-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_tablet_manager-itest.cc b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
index 8efebe9..b57f986 100644
--- a/src/kudu/integration-tests/ts_tablet_manager-itest.cc
+++ b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
@@ -60,6 +60,7 @@
 #include "kudu/util/test_util.h"
 
 DECLARE_bool(allow_unsafe_replication_factor);
+DECLARE_bool(catalog_manager_evict_excess_replicas);
 DECLARE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader);
 DECLARE_bool(enable_leader_failure_detection);
 DECLARE_bool(raft_prepare_replacement_before_eviction);
@@ -344,6 +345,7 @@ TEST_F(TsTabletManagerITest, ReportOnReplicaHealthStatus) {
 
   // This test is specific to the 3-4-3 replica management scheme.
   FLAGS_raft_prepare_replacement_before_eviction = true;
+  FLAGS_catalog_manager_evict_excess_replicas = false;
   {
     InternalMiniClusterOptions opts;
     opts.num_tablet_servers = kNumReplicas;
@@ -500,7 +502,7 @@ TEST_F(TsTabletManagerITest, ReportOnReplicaHealthStatus) {
       const auto& replica_uuid = e.first;
       SCOPED_TRACE("replica UUID: " + replica_uuid);
       if (replica_uuid == failed_replica_uuid) {
-        ASSERT_EQ(HealthReportPB::FAILED, e.second.overall_health());
+        ASSERT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, e.second.overall_health());
       } else {
         ASSERT_EQ(HealthReportPB::HEALTHY, e.second.overall_health());
       }
@@ -522,7 +524,7 @@ TEST_F(TsTabletManagerITest, ReportOnReplicaHealthStatus) {
       ASSERT_TRUE(peer.has_health_report());
       const HealthReportPB& report(peer.health_report());
       if (uuid == failed_replica_uuid) {
-        EXPECT_EQ(HealthReportPB::FAILED, report.overall_health());
+        EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, report.overall_health());
       } else {
         EXPECT_EQ(HealthReportPB::HEALTHY, report.overall_health());
       }

