geode-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bschucha...@apache.org
Subject [04/50] [abbrv] incubator-geode git commit: GEODE-77: deadlock in GMSJoinLeave
Date Fri, 20 Nov 2015 21:01:58 GMT
GEODE-77: deadlock in GMSJoinLeave

This removes the stateLock read-write lock in favor of using a
sync on viewInstallationLock, eliminating the possibility of
inversion between the two.


Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/f3b1f1b1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/f3b1f1b1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/f3b1f1b1

Branch: refs/heads/develop
Commit: f3b1f1b17f69dc9d21a846971fbe26d813e9df0a
Parents: 0f7daf2
Author: Bruce Schuchardt <bschuchardt@pivotal.io>
Authored: Tue Oct 20 11:26:26 2015 -0700
Committer: Bruce Schuchardt <bschuchardt@pivotal.io>
Committed: Tue Oct 20 11:26:26 2015 -0700

----------------------------------------------------------------------
 .../membership/gms/fd/GMSHealthMonitor.java     |   5 +
 .../membership/gms/membership/GMSJoinLeave.java | 159 +++++++++++--------
 .../gms/messenger/JGroupsMessenger.java         |  22 ++-
 .../gms/mgr/GMSMembershipManager.java           |   3 +-
 .../gms/membership/GMSJoinLeaveJUnitTest.java   |   8 +-
 .../management/ClientHealthStatsDUnitTest.java  |   1 +
 6 files changed, 118 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 9d87014..6f7cb6b 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -284,6 +284,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler
{
    * @return
    */
   private boolean doCheckMember(InternalDistributedMember pingMember) {
+    if (playingDead) {
+      return true;
+    }
     //TODO: need to some tcp check
     logger.trace("Checking member {}", pingMember);
     final CheckRequestMessage prm = constructCheckRequestMessage(pingMember);
@@ -543,10 +546,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler
{
   @Override
   public void beSick() {
     this.beingSick = true;
+    sendSuspectMessage(localAddress, "beSick invoked on GMSHealthMonitor");
   }
 
   @Override
   public void playDead() {
+    sendSuspectMessage(localAddress, "playDead invoked on GMSHealthMonitor");
     this.playingDead = true;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 4ebc20c..3f85cda 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -103,16 +103,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
   /** have I connected to the distributed system? */
   private volatile boolean isJoined;
 
-  /** a lock governing GMS state */
-  private ReadWriteLock stateLock = new ReentrantReadWriteLock();
-  
-  /** guarded by stateLock */
+  /** guarded by viewInstallationLock */
   private boolean isCoordinator;
   
   /** a synch object that guards view installation */
   private final Object viewInstallationLock = new Object();
   
-  /** the currently installed view */
+  /** the currently installed view.  Guarded by viewInstallationLock */
   private volatile NetView currentView;
   
   /** the previous view **/
@@ -196,7 +193,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     
     try {
       if (Boolean.getBoolean(BYPASS_DISCOVERY)) {
-        becomeCoordinator();
+        synchronized(viewInstallationLock) {
+          becomeCoordinator();
+        }
         return true;
       }
       
@@ -216,7 +215,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           if (localAddress.getNetMember().preferredForCoordinator()
               && state.possibleCoordinator.equals(this.localAddress)) {
             if (tries > 2 || System.currentTimeMillis() < giveupTime ) {
-              becomeCoordinator();
+              synchronized(viewInstallationLock) {
+                becomeCoordinator();
+              }
               return true;
             }
           } else {
@@ -315,8 +316,10 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       if (response.getCurrentView() != null) {
         if (response.getBecomeCoordinator()) {
           logger.info("I am being told to become the membership coordinator by {}", coord);
-          this.currentView = response.getCurrentView();
-          becomeCoordinator(null);
+          synchronized(viewInstallationLock) {
+            this.currentView = response.getCurrentView();
+            becomeCoordinator(null);
+          }
         } else {
           this.birthViewId = response.getMemberID().getVmViewId();
           this.localAddress.setVmViewId(this.birthViewId);
@@ -420,7 +423,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
         check.addCrashedMembers(removedMembers);
       }
       if (check.getCoordinator().equals(localAddress)) {
-        becomeCoordinator(incomingRequest.getMemberID());
+        synchronized(viewInstallationLock) {
+          becomeCoordinator(incomingRequest.getMemberID());
+        }
       }
     }
     else {
@@ -475,7 +480,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
         check.removeAll(removedMembers);
       }
       if (check.getCoordinator().equals(localAddress)) {
-        becomeCoordinator(mbr);
+        synchronized(viewInstallationLock) {
+          becomeCoordinator(mbr);
+        }
       }
     }
     else {
@@ -515,65 +522,75 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     becomeCoordinator(null);
   }
   
+  
+  public void becomeCoordinatorForTest() {
+    synchronized(viewInstallationLock) {
+      becomeCoordinator();
+    }
+  }
+  
   /**
+   * Transitions this member into the coordinator role.  This must
+   * be invoked under a synch on viewInstallationLock that was held
+   * at the time the decision was made to become coordinator so that
+   * the decision is atomic with actually becoming coordinator.
    * @param oldCoordinator may be null
    */
   private void becomeCoordinator(InternalDistributedMember oldCoordinator) {
     boolean testing = unitTesting.contains("noRandomViewChange");
-    stateLock.writeLock().lock();
-    try {
-      if (isCoordinator) {
-        return;
+
+    assert Thread.holdsLock(viewInstallationLock);
+    
+    if (isCoordinator) {
+      return;
+    }
+    
+    logger.info("This member is becoming the membership coordinator with address {}", localAddress);
+    isCoordinator = true;
+    if (currentView == null) {
+      // create the initial membership view
+      NetView newView = new NetView(this.localAddress);
+      this.localAddress.setVmViewId(0);
+      installView(newView);
+      isJoined = true;
+      if (viewCreator == null || viewCreator.isShutdown()) {
+        viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
+        viewCreator.setDaemon(true);
+        viewCreator.start();
       }
-      logger.info("This member is becoming the membership coordinator with address {}", localAddress);
-      isCoordinator = true;
-      if (currentView == null) {
-        // create the initial membership view
-        NetView newView = new NetView(this.localAddress);
-        this.localAddress.setVmViewId(0);
-        installView(newView);
-        isJoined = true;
-        if (viewCreator == null || viewCreator.isShutdown()) {
-          viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
-          viewCreator.setDaemon(true);
-          viewCreator.start();
+    } else {
+      // create and send out a new view
+      NetView newView;
+      Set<InternalDistributedMember> leaving = new HashSet<>();
+      Set<InternalDistributedMember> removals;
+      synchronized(viewInstallationLock) {
+        int rand = testing? 0 : NetView.RANDOM.nextInt(10);
+        int viewNumber = currentView.getViewId() + 5 + rand;
+        if (this.localAddress.getVmViewId() < 0) {
+          this.localAddress.setVmViewId(viewNumber);
         }
-      } else {
-        // create and send out a new view
-        NetView newView;
-        Set<InternalDistributedMember> leaving = new HashSet<>();
-        Set<InternalDistributedMember> removals;
-        synchronized(viewInstallationLock) {
-          int rand = testing? 0 : NetView.RANDOM.nextInt(10);
-          int viewNumber = currentView.getViewId() + 5 + rand;
-          if (this.localAddress.getVmViewId() < 0) {
-            this.localAddress.setVmViewId(viewNumber);
-          }
 
-          List<InternalDistributedMember> mbrs = new ArrayList<>(currentView.getMembers());
-          if (!mbrs.contains(localAddress)) {
-            mbrs.add(localAddress);
-          }
-          synchronized(this.removedMembers) {
-            removals = new HashSet<>(this.removedMembers);
-          }
-          if (oldCoordinator != null && !removals.contains(oldCoordinator)) {
-            leaving.add(oldCoordinator);
-          }
-          mbrs.removeAll(removals);
-          mbrs.removeAll(leaving);
-          newView = new NetView(this.localAddress, viewNumber, mbrs, leaving,
-              removals);
+        List<InternalDistributedMember> mbrs = new ArrayList<>(currentView.getMembers());
+        if (!mbrs.contains(localAddress)) {
+          mbrs.add(localAddress);
+        }
+        synchronized(this.removedMembers) {
+          removals = new HashSet<>(this.removedMembers);
         }
-        if (viewCreator == null || viewCreator.isShutdown()) {
-          viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
-          viewCreator.setInitialView(newView, leaving, removals);
-          viewCreator.setDaemon(true);
-          viewCreator.start();
+        if (oldCoordinator != null && !removals.contains(oldCoordinator)) {
+          leaving.add(oldCoordinator);
         }
+        mbrs.removeAll(removals);
+        mbrs.removeAll(leaving);
+        newView = new NetView(this.localAddress, viewNumber, mbrs, leaving,
+            removals);
+      }
+      if (viewCreator == null || viewCreator.isShutdown()) {
+        viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
+        viewCreator.setInitialView(newView, leaving, removals);
+        viewCreator.setDaemon(true);
+        viewCreator.start();
       }
-    } finally {
-      stateLock.writeLock().unlock();
     }
   }
   
@@ -1016,13 +1033,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           becomeCoordinator();
         } else if (this.isCoordinator) {
           // stop being coordinator
-          stateLock.writeLock().lock();
-          try {
-            stopCoordinatorServices();
-            this.isCoordinator = false;
-          } finally {
-            stateLock.writeLock().unlock();
-          }
+          stopCoordinatorServices();
+          this.isCoordinator = false;
         }
       }
       if (!this.isCoordinator) {
@@ -1899,15 +1911,24 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       try {
         List<Future<InternalDistributedMember>> futures;
         futures = svc.invokeAll(checkers);
-
+        long giveUpTime = System.currentTimeMillis() + viewAckTimeout;
         for (Future<InternalDistributedMember> future: futures) {
+          long now = System.currentTimeMillis();
           try {
-            InternalDistributedMember mbr = future.get(viewAckTimeout, TimeUnit.MILLISECONDS);
+            InternalDistributedMember mbr = null;
+            long timeToWait = giveUpTime - now;
+            if (timeToWait <= 0) {
+              // TODO if timeToWait==0 is future.get() guaranteed to return immediately?
+              // It looks like some code paths invoke Object.wait(0), which waits forever.
+              timeToWait = 1;
+            }
+            mbr = future.get(timeToWait, TimeUnit.MILLISECONDS);
             if (mbr != null) {
               mbrs.remove(mbr);
             }
           } catch (java.util.concurrent.TimeoutException e) {
-            // TODO should the member be removed if we can't verify it in time?
+            // timeout - member didn't pass the final check and will not be removed
+            // from the collection of members
           } catch (ExecutionException e) {
             logger.info("unexpected exception caught during member verification", e);
           }

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index 7f6a40e..2afc1f8 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -2,6 +2,7 @@ package com.gemstone.gemfire.distributed.internal.membership.gms.messenger;
 
 import static com.gemstone.gemfire.distributed.internal.membership.gms.GMSUtil.replaceStrings;
 import static com.gemstone.gemfire.internal.DataSerializableFixedID.JOIN_RESPONSE;
+import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
 
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
@@ -63,7 +64,6 @@ import com.gemstone.gemfire.distributed.internal.membership.gms.Services;
 import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.MessageHandler;
 import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.Messenger;
 import com.gemstone.gemfire.distributed.internal.membership.gms.messages.JoinResponseMessage;
-import com.gemstone.gemfire.distributed.internal.membership.gms.mgr.GMSMembershipManager;
 import com.gemstone.gemfire.internal.ClassPathLoader;
 import com.gemstone.gemfire.internal.HeapDataOutputStream;
 import com.gemstone.gemfire.internal.OSProcess;
@@ -77,8 +77,6 @@ import com.gemstone.gemfire.internal.i18n.LocalizedStrings;
 import com.gemstone.gemfire.internal.logging.log4j.LocalizedMessage;
 import com.gemstone.gemfire.internal.tcp.MemberShunnedException;
 
-import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
-
 public class JGroupsMessenger implements Messenger {
 
   private static final Logger logger = Services.getLogger();
@@ -124,9 +122,9 @@ public class JGroupsMessenger implements Messenger {
   
   private GMSPingPonger pingPonger = new GMSPingPonger();
   
-  private volatile long pingsReceived;
-  
   private volatile long pongsReceived;
+  
+  private boolean playingDead;
 
   
   static {
@@ -431,10 +429,12 @@ public class JGroupsMessenger implements Messenger {
 
   @Override
   public void playDead() {
+    playingDead = true;
   }
 
   @Override
   public void beHealthy() {
+    playingDead = false;
   }
 
   @Override
@@ -477,6 +477,17 @@ public class JGroupsMessenger implements Messenger {
       throw new DistributedSystemDisconnectedException("Distributed System is shutting down");
     }
     
+    if (playingDead) {
+      Set result = new HashSet<>();
+      InternalDistributedMember[] rec = msg.getRecipients();
+      if (rec != null) {
+        for (int i=0; i<rec.length; i++) {
+          result.add(rec[i]);
+        }
+      }
+      return result;
+    }
+    
     filterOutgoingMessage(msg);
     
     InternalDistributedMember[] destinations = msg.getRecipients();
@@ -877,7 +888,6 @@ public class JGroupsMessenger implements Messenger {
       //Respond to ping messages sent from other systems that are in a auto reconnect state
       byte[] contents = jgmsg.getBuffer();
       if (pingPonger.isPingMessage(contents)) {
-        pingsReceived++;
         try {
           pingPonger.sendPongMessage(myChannel, jgAddress, jgmsg.getSrc());
         }

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
index 50dc99e..a102ac2 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
@@ -2894,6 +2894,8 @@ public class GMSMembershipManager implements MembershipManager, Manager
     services.setShutdownCause(shutdownCause);
     services.getCancelCriterion().cancel(reason);
 
+    AlertAppender.getInstance().shuttingDown();
+
     if (!inhibitForceDisconnectLogging) {
       logger.fatal(LocalizedMessage.create(
           LocalizedStrings.GroupMembershipService_MEMBERSHIP_SERVICE_FAILURE_0, reason),
shutdownCause);
@@ -2903,7 +2905,6 @@ public class GMSMembershipManager implements MembershipManager, Manager
       saveCacheXmlForReconnect();
     }
     
-    AlertAppender.getInstance().shuttingDown();
 
     Thread reconnectThread = new Thread (new Runnable() {
       public void run() {

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
b/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index d29a6dd..42594cf 100644
--- a/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -292,7 +292,7 @@ public class GMSJoinLeaveJUnitTest {
     gmsJoinLeave.unitTesting.add("noRandomViewChange");
     prepareAndInstallView();
     gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
-    gmsJoinLeave.becomeCoordinator();
+    gmsJoinLeave.becomeCoordinatorForTest();
 
     LeaveRequestMessage msg = new LeaveRequestMessage(gmsJoinLeave.getMemberID(), mockMembers[0],
reason);
     msg.setSender(mockMembers[0]);
@@ -318,7 +318,7 @@ public class GMSJoinLeaveJUnitTest {
     gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
     gmsJoinLeave.getView().add(mockMembers[1]);
     gmsJoinLeave.unitTesting.add("noRandomViewChange");
-    gmsJoinLeave.becomeCoordinator();
+    gmsJoinLeave.becomeCoordinatorForTest();
     RemoveMemberMessage msg = new RemoveMemberMessage(gmsJoinLeave.getMemberID(), mockMembers[0],
reason);
     msg.setSender(mockMembers[0]);
     gmsJoinLeave.processMessage(msg);
@@ -345,7 +345,7 @@ public class GMSJoinLeaveJUnitTest {
     prepareAndInstallView();
     gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
     gmsJoinLeave.getView().add(mockMembers[1]);
-    gmsJoinLeave.becomeCoordinator();
+    gmsJoinLeave.becomeCoordinatorForTest();
     JoinRequestMessage msg = new JoinRequestMessage(gmsJoinLeaveMemberId, mockMembers[2],
null);
     msg.setSender(mockMembers[2]);
     gmsJoinLeave.processMessage(msg);
@@ -473,7 +473,7 @@ public class GMSJoinLeaveJUnitTest {
     NetView oldView = gmsJoinLeave.getView();
     oldView.add(gmsJoinLeaveMemberId);
     InternalDistributedMember creator = oldView.getCreator();
-    gmsJoinLeave.becomeCoordinator();
+    gmsJoinLeave.becomeCoordinatorForTest();
     NetView view = new NetView(2, gmsJoinLeave.getView().getViewId()+1);
     view.setCreator(creator);
     view.add(creator);

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
b/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
index 891cb60..ab9d8ba 100644
--- a/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
+++ b/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
@@ -94,6 +94,7 @@ public class ClientHealthStatsDUnitTest extends DistributedTestCase {
     server = host.getVM(1);
     client = host.getVM(2);
     client2 = host.getVM(3);
+    addExpectedException("Connection reset");
   }
 
   public void tearDown2() throws Exception {


Mime
View raw message