Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 53CF4200C1E for ; Fri, 17 Feb 2017 19:22:54 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 5260D160B46; Fri, 17 Feb 2017 18:22:54 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 5028A160B57 for ; Fri, 17 Feb 2017 19:22:53 +0100 (CET) Received: (qmail 53388 invoked by uid 500); 17 Feb 2017 18:22:52 -0000 Mailing-List: contact commits-help@geode.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@geode.apache.org Delivered-To: mailing list commits@geode.apache.org Received: (qmail 53379 invoked by uid 99); 17 Feb 2017 18:22:52 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 17 Feb 2017 18:22:52 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 6CAC3DFC1C; Fri, 17 Feb 2017 18:22:52 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: bschuchardt@apache.org To: commits@geode.apache.org Message-Id: <37d5c28c7c7d4da4a0123edbdfa6ed9a@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: geode git commit: GEODE-2497 surprise members are never timed out during startup Date: Fri, 17 Feb 2017 18:22:52 +0000 (UTC) archived-at: Fri, 17 Feb 2017 18:22:54 -0000 Repository: geode Updated Branches: refs/heads/feature/GEODE-2497 [created] 8d45ca227 GEODE-2497 surprise members are never timed out during startup Moved the creation of the timer to GMSMembershipManager.started() Removed write-lock in timer-creation method since it's only called from one place now Altered the way that the timer-creation method finds the InternalDistributedSystem. The old way of using getAnyInstance() was the primary source of the problem since it returns null until startup is completed. Altered the surprise-member unit test to ensure that it's using the timer and not relying on installation of a new membership view to clean things up. Altered the surprise-member unit test to run faster. It now completes in under 10 seconds. Project: http://git-wip-us.apache.org/repos/asf/geode/repo Commit: http://git-wip-us.apache.org/repos/asf/geode/commit/8d45ca22 Tree: http://git-wip-us.apache.org/repos/asf/geode/tree/8d45ca22 Diff: http://git-wip-us.apache.org/repos/asf/geode/diff/8d45ca22 Branch: refs/heads/feature/GEODE-2497 Commit: 8d45ca22737282abe279d3c863478f904f2e1926 Parents: dbea592 Author: Bruce Schuchardt Authored: Fri Feb 17 10:17:21 2017 -0800 Committer: Bruce Schuchardt Committed: Fri Feb 17 10:22:25 2017 -0800 ---------------------------------------------------------------------- .../gms/mgr/GMSMembershipManager.java | 75 +++++------- .../internal/DistributionManagerDUnitTest.java | 117 +++++++++---------- 2 files changed, 89 insertions(+), 103 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/geode/blob/8d45ca22/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java ---------------------------------------------------------------------- diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java index cf17025..6cefe4e 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java @@ -608,7 +608,6 @@ public class GMSMembershipManager implements MembershipManager, Manager { } try { listener.viewInstalled(latestView); - startCleanupTimer(); } catch (DistributedSystemDisconnectedException se) { } } finally { @@ -616,6 +615,10 @@ public class GMSMembershipManager implements MembershipManager, Manager { } } + public boolean isCleanupTimerStarted() { + return this.cleanupTimer != null; + } + /** * the timer used to perform periodic tasks * @@ -767,7 +770,9 @@ public class GMSMembershipManager implements MembershipManager, Manager { } @Override - public void started() {} + public void started() { + startCleanupTimer(); + } /** this is invoked by JoinLeave when there is a loss of quorum in the membership system */ @@ -942,12 +947,6 @@ public class GMSMembershipManager implements MembershipManager, Manager { surpriseMembers.remove(member); } else { - // Now that we're sure the member is new, add them. - // make sure the surprise-member cleanup task is running - if (this.cleanupTimer == null) { - startCleanupTimer(); - } // cleanupTimer == null - // Ensure that the member is accounted for in the view // Conjure up a new view including the new member. This is necessary // because we are about to tell the listener about a new member, so @@ -978,43 +977,33 @@ public class GMSMembershipManager implements MembershipManager, Manager { /** starts periodic task to perform cleanup chores such as expire surprise members */ private void startCleanupTimer() { - latestViewWriteLock.lock(); - try { - if (this.cleanupTimer != null) { - return; - } - DistributedSystem ds = InternalDistributedSystem.getAnyInstance(); - if (ds != null && ds.isConnected()) { - this.cleanupTimer = new SystemTimer(ds, true); - SystemTimer.SystemTimerTask st = new SystemTimer.SystemTimerTask() { - @Override - public void run2() { - latestViewWriteLock.lock(); - try { - long oldestAllowed = System.currentTimeMillis() - surpriseMemberTimeout; - for (Iterator it = surpriseMembers.entrySet().iterator(); it.hasNext();) { - Map.Entry entry = (Map.Entry) it.next(); - Long birthtime = (Long) entry.getValue(); - if (birthtime.longValue() < oldestAllowed) { - it.remove(); - InternalDistributedMember m = (InternalDistributedMember) entry.getKey(); - logger.info(LocalizedMessage.create( - LocalizedStrings.GroupMembershipService_MEMBERSHIP_EXPIRING_MEMBERSHIP_OF_SURPRISE_MEMBER_0, - m)); - removeWithViewLock(m, true, - "not seen in membership view in " + surpriseMemberTimeout + "ms"); - } - } - } finally { - latestViewWriteLock.unlock(); + DistributedSystem ds = this.dcReceiver.getDM().getSystem(); + this.cleanupTimer = new SystemTimer(ds, true); + SystemTimer.SystemTimerTask st = new SystemTimer.SystemTimerTask() { + @Override + public void run2() { + latestViewWriteLock.lock(); + try { + long oldestAllowed = System.currentTimeMillis() - surpriseMemberTimeout; + for (Iterator it = surpriseMembers.entrySet().iterator(); it.hasNext();) { + Map.Entry entry = (Map.Entry) it.next(); + Long birthtime = (Long) entry.getValue(); + if (birthtime.longValue() < oldestAllowed) { + it.remove(); + InternalDistributedMember m = (InternalDistributedMember) entry.getKey(); + logger.info(LocalizedMessage.create( + LocalizedStrings.GroupMembershipService_MEMBERSHIP_EXPIRING_MEMBERSHIP_OF_SURPRISE_MEMBER_0, + m)); + removeWithViewLock(m, true, + "not seen in membership view in " + surpriseMemberTimeout + "ms"); } } - }; - this.cleanupTimer.scheduleAtFixedRate(st, surpriseMemberTimeout, surpriseMemberTimeout / 3); - } // ds != null && ds.isConnected() - } finally { - latestViewWriteLock.unlock(); - } + } finally { + latestViewWriteLock.unlock(); + } + } + }; + this.cleanupTimer.scheduleAtFixedRate(st, surpriseMemberTimeout, surpriseMemberTimeout / 3); } /** http://git-wip-us.apache.org/repos/asf/geode/blob/8d45ca22/geode-core/src/test/java/org/apache/geode/distributed/internal/DistributionManagerDUnitTest.java ---------------------------------------------------------------------- diff --git a/geode-core/src/test/java/org/apache/geode/distributed/internal/DistributionManagerDUnitTest.java b/geode-core/src/test/java/org/apache/geode/distributed/internal/DistributionManagerDUnitTest.java index ccce013..2b405fa 100644 --- a/geode-core/src/test/java/org/apache/geode/distributed/internal/DistributionManagerDUnitTest.java +++ b/geode-core/src/test/java/org/apache/geode/distributed/internal/DistributionManagerDUnitTest.java @@ -159,77 +159,74 @@ public class DistributionManagerDUnitTest extends JUnit4DistributedTestCase { **/ @Test public void testSurpriseMemberHandling() { - VM vm0 = Host.getHost(0).getVM(0); - - InternalDistributedSystem sys = getSystem(); - MembershipManager mgr = MembershipManagerHelper.getMembershipManager(sys); + System.setProperty(DistributionConfig.GEMFIRE_PREFIX + "surprise-member-timeout", "3000"); try { - InternalDistributedMember mbr = - new InternalDistributedMember(NetworkUtils.getIPLiteral(), 12345); + InternalDistributedSystem sys = getSystem(); + MembershipManager mgr = MembershipManagerHelper.getMembershipManager(sys); + assertTrue(((GMSMembershipManager) mgr).isCleanupTimerStarted()); - // first make sure we can't add this as a surprise member (bug #44566) - - // if the view number isn't being recorded correctly the test will pass but the - // functionality is broken - Assert.assertTrue("expected view ID to be greater than zero", mgr.getView().getViewId() > 0); - - int oldViewId = mbr.getVmViewId(); - mbr.setVmViewId((int) mgr.getView().getViewId() - 1); - org.apache.geode.test.dunit.LogWriterUtils.getLogWriter() - .info("current membership view is " + mgr.getView()); - org.apache.geode.test.dunit.LogWriterUtils.getLogWriter() - .info("created ID " + mbr + " with view ID " + mbr.getVmViewId()); - sys.getLogWriter() - .info("attempt to add old member"); - sys.getLogWriter() - .info("Removing shunned GemFire node"); try { - boolean accepted = mgr.addSurpriseMember(mbr); - Assert.assertTrue("member with old ID was not rejected (bug #44566)", !accepted); - } finally { + InternalDistributedMember mbr = + new InternalDistributedMember(NetworkUtils.getIPLiteral(), 12345); + + // first make sure we can't add this as a surprise member (bug #44566) + + // if the view number isn't being recorded correctly the test will pass but the + // functionality is broken + Assert.assertTrue("expected view ID to be greater than zero", + mgr.getView().getViewId() > 0); + + int oldViewId = mbr.getVmViewId(); + mbr.setVmViewId((int) mgr.getView().getViewId() - 1); + org.apache.geode.test.dunit.LogWriterUtils.getLogWriter() + .info("current membership view is " + mgr.getView()); + org.apache.geode.test.dunit.LogWriterUtils.getLogWriter() + .info("created ID " + mbr + " with view ID " + mbr.getVmViewId()); sys.getLogWriter() - .info("attempt to add old member"); + .info("attempt to add old member"); sys.getLogWriter().info( - "Removing shunned GemFire node"); - } - mbr.setVmViewId(oldViewId); - - // now forcibly add it as a surprise member and show that it is reaped - long gracePeriod = 5000; - long startTime = System.currentTimeMillis(); - long timeout = ((GMSMembershipManager) mgr).getSurpriseMemberTimeout(); - long birthTime = startTime - timeout + gracePeriod; - MembershipManagerHelper.addSurpriseMember(sys, mbr, birthTime); - assertTrue("Member was not a surprise member", mgr.isSurpriseMember(mbr)); - - // force a real view change - SerializableRunnable connectDisconnect = new SerializableRunnable() { - public void run() { - getSystem().disconnect(); + "Removing shunned GemFire node"); + try { + boolean accepted = mgr.addSurpriseMember(mbr); + Assert.assertTrue("member with old ID was not rejected (bug #44566)", !accepted); + } finally { + sys.getLogWriter().info( + "attempt to add old member"); + sys.getLogWriter().info( + "Removing shunned GemFire node"); } - }; - vm0.invoke(connectDisconnect); - - if (birthTime < (System.currentTimeMillis() - timeout)) { - return; // machine is too busy and we didn't get enough CPU to perform more assertions - } - assertTrue("Member was incorrectly removed from surprise member set", - mgr.isSurpriseMember(mbr)); + mbr.setVmViewId(oldViewId); + + // now forcibly add it as a surprise member and show that it is reaped + long gracePeriod = 5000; + long startTime = System.currentTimeMillis(); + long timeout = ((GMSMembershipManager) mgr).getSurpriseMemberTimeout(); + long birthTime = startTime - timeout + gracePeriod; + MembershipManagerHelper.addSurpriseMember(sys, mbr, birthTime); + assertTrue("Member was not a surprise member", mgr.isSurpriseMember(mbr)); + + if (birthTime < (System.currentTimeMillis() - timeout)) { + return; // machine is too busy and we didn't get enough CPU to perform more assertions + } + assertTrue("Member was incorrectly removed from surprise member set", + mgr.isSurpriseMember(mbr)); - try { - Thread.sleep(gracePeriod); - } catch (InterruptedException e) { - fail("test was interrupted", e); - } + try { + Thread.sleep((timeout / 3) + gracePeriod); + } catch (InterruptedException e) { + fail("test was interrupted", e); + } - vm0.invoke(connectDisconnect); - assertTrue("Member was not removed from surprise member set", !mgr.isSurpriseMember(mbr)); + assertTrue("Member was not removed from surprise member set", !mgr.isSurpriseMember(mbr)); - } finally { - if (sys != null && sys.isConnected()) { - sys.disconnect(); + } finally { + if (sys != null && sys.isConnected()) { + sys.disconnect(); + } } + } finally { + System.getProperties().remove(DistributionConfig.GEMFIRE_PREFIX + "surprise-member-timeout"); } }