geode-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bschucha...@apache.org
Subject [geode] branch develop updated: GEODE-6244 Healthy member kicked out by Sick member when final-check fails
Date Mon, 04 Feb 2019 17:41:42 GMT
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new f8c69d2  GEODE-6244 Healthy member kicked out by Sick member when final-check fails
f8c69d2 is described below

commit f8c69d2b647edf7b3e9f93446a39e381fe3b70d9
Author: Bruce Schuchardt <bschuchardt@pivotal.io>
AuthorDate: Mon Feb 4 09:40:37 2019 -0800

    GEODE-6244 Healthy member kicked out by Sick member when final-check fails
    
    The initial fix caused a problem that prevented election of a new
    membership coordinator in a certain case.  The case was a view
    with nodes [A, B, C, D, E] where C was the coordinator.  Node A had
    crashed and the crash had been detected by B.  Node C then left the
    cluster, sending a Leave message to B.  B's JoinLeave did not know about
    the HealthMonitor's decision that A was crashed and did not become the
    new coordinator.
    
    This commit makes B's JoinLeave pay attention to the crashed-member set
    in the HealthMonitor when deciding whether to become the membership
    coordinator for the cluster.
---
 .../gms/membership/GMSJoinLeaveJUnitTest.java        | 20 ++++++++++++++++++++
 .../internal/membership/gms/fd/GMSHealthMonitor.java | 12 +++++++++++-
 .../membership/gms/interfaces/HealthMonitor.java     |  7 +++++++
 .../membership/gms/membership/GMSJoinLeave.java      |  6 ++++++
 4 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index a503809..f59f677 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -733,6 +733,26 @@ public class GMSJoinLeaveJUnitTest {
     assertTrue("Expected becomeCoordinator to be invoked", gmsJoinLeave.isCoordinator());
   }
 
+  /**
+   * Given a view with [A, B, C, D, E] where C is coordinator, A failed availability checks and
+   * C shuts down we should see B become the coordinator.
+   */
+  @Test
+  public void testBecomeCoordinatorThroughShutdownWhenOlderMemberCrashed() throws Exception {
+    initMocks();
+    InternalDistributedMember A = mockMembers[0],
+        B = gmsJoinLeaveMemberId,
+        C = mockMembers[1],
+        D = mockMembers[2],
+        E = mockMembers[3];
+    prepareAndInstallView(C, createMemberList(A, B, C, D, E));
+    when(healthMonitor.getMembersFailingAvailabilityCheck()).thenReturn(Collections.singleton(A));
+    LeaveRequestMessage msg = new LeaveRequestMessage(B, C, "leaving for test");
+    msg.setSender(C);
+    gmsJoinLeave.processMessage(msg);
+    assertTrue("Expected becomeCoordinator to be invoked", gmsJoinLeave.isCoordinator());
+  }
+
   @Test
   public void testBecomeCoordinatorThroughViewChange() throws Exception {
     initMocks();
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index cf6e9e5..d880501 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -842,12 +842,17 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       }
       InternalDistributedMember oldNeighbor = nextNeighbor;
       if (oldNeighbor != newNeighbor) {
-        logger.info("Failure detection is now watching {}", newNeighbor);
+        logger.info("Failure detection is now watching " + newNeighbor
+            + "; suspects are " + suspectedMemberIds);
         nextNeighbor = newNeighbor;
       }
     }
 
     if (nextNeighbor != null && nextNeighbor.equals(localAddress)) {
+      if (logger.isDebugEnabled()) {
+        logger.debug("Health monitor is unable to find a neighbor to watch.  "
+            + "Current suspects are {}", suspectedMemberIds);
+      }
       nextNeighbor = null;
     }
 
@@ -1355,6 +1360,11 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     return this.socketPort;
   }
 
+  @Override
+  public Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck() {
+    return Collections.unmodifiableCollection(this.suspectedMemberIds.keySet());
+  }
+
   private void sendSuspectRequest(final List<SuspectRequest> requests) {
     logger.debug("Sending suspect request for members {}", requests);
     List<InternalDistributedMember> recipients;
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
index abd7a66..1975186 100755
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/HealthMonitor.java
@@ -14,6 +14,8 @@
  */
 package org.apache.geode.distributed.internal.membership.gms.interfaces;
 
+import java.util.Collection;
+
 import org.apache.geode.distributed.DistributedMember;
 import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
 
@@ -51,4 +53,9 @@ public interface HealthMonitor extends Service {
    */
   int getFailureDetectionPort();
 
+  /**
+   * Returns the set of members declared dead by the health monitor
+   */
+  Collection<InternalDistributedMember> getMembersFailingAvailabilityCheck();
+
 }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 6046827..75ee997 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -608,6 +608,12 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
         leftMembers.add(mbr);
         check.removeAll(leftMembers);
       }
+      Collection<InternalDistributedMember> suspectMembers =
+          services.getHealthMonitor().getMembersFailingAvailabilityCheck();
+      check.removeAll(suspectMembers);
+      logger.info(
+          "View with removed and left members removed is {}\nremoved members: {}\nleft members: {}\nsuspect members: {}",
+          check, removedMembers, leftMembers, suspectMembers);
       if (check.getCoordinator().equals(localAddress)) {
         synchronized (viewInstallationLock) {
           becomeCoordinator(mbr);


Mime
View raw message