Return-Path: X-Original-To: apmail-geode-commits-archive@minotaur.apache.org Delivered-To: apmail-geode-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id F03F3186B4 for ; Fri, 20 Nov 2015 21:02:10 +0000 (UTC) Received: (qmail 8449 invoked by uid 500); 20 Nov 2015 21:02:10 -0000 Delivered-To: apmail-geode-commits-archive@geode.apache.org Received: (qmail 8420 invoked by uid 500); 20 Nov 2015 21:02:10 -0000 Mailing-List: contact commits-help@geode.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@geode.incubator.apache.org Delivered-To: mailing list commits@geode.incubator.apache.org Received: (qmail 8411 invoked by uid 99); 20 Nov 2015 21:02:10 -0000 Received: from Unknown (HELO spamd2-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 20 Nov 2015 21:02:10 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd2-us-west.apache.org (ASF Mail Server at spamd2-us-west.apache.org) with ESMTP id 6B2921A24ED for ; Fri, 20 Nov 2015 21:02:10 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd2-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 1.771 X-Spam-Level: * X-Spam-Status: No, score=1.771 tagged_above=-999 required=6.31 tests=[KAM_ASCII_DIVIDERS=0.8, KAM_LAZY_DOMAIN_SECURITY=1, RCVD_IN_MSPIKE_H3=-0.01, RCVD_IN_MSPIKE_WL=-0.01, T_RP_MATCHES_RCVD=-0.01, URIBL_BLOCKED=0.001] autolearn=disabled Received: from mx1-eu-west.apache.org ([10.40.0.8]) by localhost (spamd2-us-west.apache.org [10.40.0.9]) (amavisd-new, port 10024) with ESMTP id hilRAFJTQ1rX for ; Fri, 20 Nov 2015 21:01:58 +0000 (UTC) Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx1-eu-west.apache.org (ASF Mail Server at mx1-eu-west.apache.org) with SMTP id 9ED47211D3 for ; Fri, 20 Nov 2015 21:01:56 +0000 (UTC) Received: (qmail 7140 invoked by uid 99); 20 Nov 2015 21:01:55 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 20 Nov 2015 21:01:55 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id B259FE33F2; Fri, 20 Nov 2015 21:01:55 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: bschuchardt@apache.org To: commits@geode.incubator.apache.org Date: Fri, 20 Nov 2015 21:01:58 -0000 Message-Id: In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [04/50] [abbrv] incubator-geode git commit: GEODE-77: deadlock in GMSJoinLeave GEODE-77: deadlock in GMSJoinLeave This removes the stateLock read-write lock in favor of using a sync on viewInstallationLock, eliminating the possibility of inversion between the two. Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/f3b1f1b1 Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/f3b1f1b1 Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/f3b1f1b1 Branch: refs/heads/develop Commit: f3b1f1b17f69dc9d21a846971fbe26d813e9df0a Parents: 0f7daf2 Author: Bruce Schuchardt Authored: Tue Oct 20 11:26:26 2015 -0700 Committer: Bruce Schuchardt Committed: Tue Oct 20 11:26:26 2015 -0700 ---------------------------------------------------------------------- .../membership/gms/fd/GMSHealthMonitor.java | 5 + .../membership/gms/membership/GMSJoinLeave.java | 159 +++++++++++-------- .../gms/messenger/JGroupsMessenger.java | 22 ++- .../gms/mgr/GMSMembershipManager.java | 3 +- .../gms/membership/GMSJoinLeaveJUnitTest.java | 8 +- .../management/ClientHealthStatsDUnitTest.java | 1 + 6 files changed, 118 insertions(+), 80 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java index 9d87014..6f7cb6b 100755 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java @@ -284,6 +284,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler { * @return */ private boolean doCheckMember(InternalDistributedMember pingMember) { + if (playingDead) { + return true; + } //TODO: need to some tcp check logger.trace("Checking member {}", pingMember); final CheckRequestMessage prm = constructCheckRequestMessage(pingMember); @@ -543,10 +546,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler { @Override public void beSick() { this.beingSick = true; + sendSuspectMessage(localAddress, "beSick invoked on GMSHealthMonitor"); } @Override public void playDead() { + sendSuspectMessage(localAddress, "playDead invoked on GMSHealthMonitor"); this.playingDead = true; } http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java index 4ebc20c..3f85cda 100755 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java @@ -103,16 +103,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { /** have I connected to the distributed system? */ private volatile boolean isJoined; - /** a lock governing GMS state */ - private ReadWriteLock stateLock = new ReentrantReadWriteLock(); - - /** guarded by stateLock */ + /** guarded by viewInstallationLock */ private boolean isCoordinator; /** a synch object that guards view installation */ private final Object viewInstallationLock = new Object(); - /** the currently installed view */ + /** the currently installed view. Guarded by viewInstallationLock */ private volatile NetView currentView; /** the previous view **/ @@ -196,7 +193,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { try { if (Boolean.getBoolean(BYPASS_DISCOVERY)) { - becomeCoordinator(); + synchronized(viewInstallationLock) { + becomeCoordinator(); + } return true; } @@ -216,7 +215,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { if (localAddress.getNetMember().preferredForCoordinator() && state.possibleCoordinator.equals(this.localAddress)) { if (tries > 2 || System.currentTimeMillis() < giveupTime ) { - becomeCoordinator(); + synchronized(viewInstallationLock) { + becomeCoordinator(); + } return true; } } else { @@ -315,8 +316,10 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { if (response.getCurrentView() != null) { if (response.getBecomeCoordinator()) { logger.info("I am being told to become the membership coordinator by {}", coord); - this.currentView = response.getCurrentView(); - becomeCoordinator(null); + synchronized(viewInstallationLock) { + this.currentView = response.getCurrentView(); + becomeCoordinator(null); + } } else { this.birthViewId = response.getMemberID().getVmViewId(); this.localAddress.setVmViewId(this.birthViewId); @@ -420,7 +423,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { check.addCrashedMembers(removedMembers); } if (check.getCoordinator().equals(localAddress)) { - becomeCoordinator(incomingRequest.getMemberID()); + synchronized(viewInstallationLock) { + becomeCoordinator(incomingRequest.getMemberID()); + } } } else { @@ -475,7 +480,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { check.removeAll(removedMembers); } if (check.getCoordinator().equals(localAddress)) { - becomeCoordinator(mbr); + synchronized(viewInstallationLock) { + becomeCoordinator(mbr); + } } } else { @@ -515,65 +522,75 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { becomeCoordinator(null); } + + public void becomeCoordinatorForTest() { + synchronized(viewInstallationLock) { + becomeCoordinator(); + } + } + /** + * Transitions this member into the coordinator role. This must + * be invoked under a synch on viewInstallationLock that was held + * at the time the decision was made to become coordinator so that + * the decision is atomic with actually becoming coordinator. * @param oldCoordinator may be null */ private void becomeCoordinator(InternalDistributedMember oldCoordinator) { boolean testing = unitTesting.contains("noRandomViewChange"); - stateLock.writeLock().lock(); - try { - if (isCoordinator) { - return; + + assert Thread.holdsLock(viewInstallationLock); + + if (isCoordinator) { + return; + } + + logger.info("This member is becoming the membership coordinator with address {}", localAddress); + isCoordinator = true; + if (currentView == null) { + // create the initial membership view + NetView newView = new NetView(this.localAddress); + this.localAddress.setVmViewId(0); + installView(newView); + isJoined = true; + if (viewCreator == null || viewCreator.isShutdown()) { + viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup()); + viewCreator.setDaemon(true); + viewCreator.start(); } - logger.info("This member is becoming the membership coordinator with address {}", localAddress); - isCoordinator = true; - if (currentView == null) { - // create the initial membership view - NetView newView = new NetView(this.localAddress); - this.localAddress.setVmViewId(0); - installView(newView); - isJoined = true; - if (viewCreator == null || viewCreator.isShutdown()) { - viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup()); - viewCreator.setDaemon(true); - viewCreator.start(); + } else { + // create and send out a new view + NetView newView; + Set leaving = new HashSet<>(); + Set removals; + synchronized(viewInstallationLock) { + int rand = testing? 0 : NetView.RANDOM.nextInt(10); + int viewNumber = currentView.getViewId() + 5 + rand; + if (this.localAddress.getVmViewId() < 0) { + this.localAddress.setVmViewId(viewNumber); } - } else { - // create and send out a new view - NetView newView; - Set leaving = new HashSet<>(); - Set removals; - synchronized(viewInstallationLock) { - int rand = testing? 0 : NetView.RANDOM.nextInt(10); - int viewNumber = currentView.getViewId() + 5 + rand; - if (this.localAddress.getVmViewId() < 0) { - this.localAddress.setVmViewId(viewNumber); - } - List mbrs = new ArrayList<>(currentView.getMembers()); - if (!mbrs.contains(localAddress)) { - mbrs.add(localAddress); - } - synchronized(this.removedMembers) { - removals = new HashSet<>(this.removedMembers); - } - if (oldCoordinator != null && !removals.contains(oldCoordinator)) { - leaving.add(oldCoordinator); - } - mbrs.removeAll(removals); - mbrs.removeAll(leaving); - newView = new NetView(this.localAddress, viewNumber, mbrs, leaving, - removals); + List mbrs = new ArrayList<>(currentView.getMembers()); + if (!mbrs.contains(localAddress)) { + mbrs.add(localAddress); + } + synchronized(this.removedMembers) { + removals = new HashSet<>(this.removedMembers); } - if (viewCreator == null || viewCreator.isShutdown()) { - viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup()); - viewCreator.setInitialView(newView, leaving, removals); - viewCreator.setDaemon(true); - viewCreator.start(); + if (oldCoordinator != null && !removals.contains(oldCoordinator)) { + leaving.add(oldCoordinator); } + mbrs.removeAll(removals); + mbrs.removeAll(leaving); + newView = new NetView(this.localAddress, viewNumber, mbrs, leaving, + removals); + } + if (viewCreator == null || viewCreator.isShutdown()) { + viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup()); + viewCreator.setInitialView(newView, leaving, removals); + viewCreator.setDaemon(true); + viewCreator.start(); } - } finally { - stateLock.writeLock().unlock(); } } @@ -1016,13 +1033,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { becomeCoordinator(); } else if (this.isCoordinator) { // stop being coordinator - stateLock.writeLock().lock(); - try { - stopCoordinatorServices(); - this.isCoordinator = false; - } finally { - stateLock.writeLock().unlock(); - } + stopCoordinatorServices(); + this.isCoordinator = false; } } if (!this.isCoordinator) { @@ -1899,15 +1911,24 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler { try { List> futures; futures = svc.invokeAll(checkers); - + long giveUpTime = System.currentTimeMillis() + viewAckTimeout; for (Future future: futures) { + long now = System.currentTimeMillis(); try { - InternalDistributedMember mbr = future.get(viewAckTimeout, TimeUnit.MILLISECONDS); + InternalDistributedMember mbr = null; + long timeToWait = giveUpTime - now; + if (timeToWait <= 0) { + // TODO if timeToWait==0 is future.get() guaranteed to return immediately? + // It looks like some code paths invoke Object.wait(0), which waits forever. + timeToWait = 1; + } + mbr = future.get(timeToWait, TimeUnit.MILLISECONDS); if (mbr != null) { mbrs.remove(mbr); } } catch (java.util.concurrent.TimeoutException e) { - // TODO should the member be removed if we can't verify it in time? + // timeout - member didn't pass the final check and will not be removed + // from the collection of members } catch (ExecutionException e) { logger.info("unexpected exception caught during member verification", e); } http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java index 7f6a40e..2afc1f8 100755 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java @@ -2,6 +2,7 @@ package com.gemstone.gemfire.distributed.internal.membership.gms.messenger; import static com.gemstone.gemfire.distributed.internal.membership.gms.GMSUtil.replaceStrings; import static com.gemstone.gemfire.internal.DataSerializableFixedID.JOIN_RESPONSE; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; import java.io.BufferedReader; import java.io.ByteArrayInputStream; @@ -63,7 +64,6 @@ import com.gemstone.gemfire.distributed.internal.membership.gms.Services; import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.MessageHandler; import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.Messenger; import com.gemstone.gemfire.distributed.internal.membership.gms.messages.JoinResponseMessage; -import com.gemstone.gemfire.distributed.internal.membership.gms.mgr.GMSMembershipManager; import com.gemstone.gemfire.internal.ClassPathLoader; import com.gemstone.gemfire.internal.HeapDataOutputStream; import com.gemstone.gemfire.internal.OSProcess; @@ -77,8 +77,6 @@ import com.gemstone.gemfire.internal.i18n.LocalizedStrings; import com.gemstone.gemfire.internal.logging.log4j.LocalizedMessage; import com.gemstone.gemfire.internal.tcp.MemberShunnedException; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; - public class JGroupsMessenger implements Messenger { private static final Logger logger = Services.getLogger(); @@ -124,9 +122,9 @@ public class JGroupsMessenger implements Messenger { private GMSPingPonger pingPonger = new GMSPingPonger(); - private volatile long pingsReceived; - private volatile long pongsReceived; + + private boolean playingDead; static { @@ -431,10 +429,12 @@ public class JGroupsMessenger implements Messenger { @Override public void playDead() { + playingDead = true; } @Override public void beHealthy() { + playingDead = false; } @Override @@ -477,6 +477,17 @@ public class JGroupsMessenger implements Messenger { throw new DistributedSystemDisconnectedException("Distributed System is shutting down"); } + if (playingDead) { + Set result = new HashSet<>(); + InternalDistributedMember[] rec = msg.getRecipients(); + if (rec != null) { + for (int i=0; i