Return-Path: X-Original-To: apmail-brooklyn-commits-archive@minotaur.apache.org Delivered-To: apmail-brooklyn-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8C96511DEB for ; Sat, 7 Jun 2014 19:55:38 +0000 (UTC) Received: (qmail 96279 invoked by uid 500); 7 Jun 2014 19:55:38 -0000 Delivered-To: apmail-brooklyn-commits-archive@brooklyn.apache.org Received: (qmail 96257 invoked by uid 500); 7 Jun 2014 19:55:38 -0000 Mailing-List: contact commits-help@brooklyn.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@brooklyn.incubator.apache.org Delivered-To: mailing list commits@brooklyn.incubator.apache.org Received: (qmail 96250 invoked by uid 99); 7 Jun 2014 19:55:38 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 07 Jun 2014 19:55:38 +0000 X-ASF-Spam-Status: No, hits=-2000.7 required=5.0 tests=ALL_TRUSTED,RP_MATCHES_RCVD X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO mail.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with SMTP; Sat, 07 Jun 2014 19:55:37 +0000 Received: (qmail 96216 invoked by uid 99); 7 Jun 2014 19:55:16 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 07 Jun 2014 19:55:16 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id A512D46159; Sat, 7 Jun 2014 19:55:16 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: richard@apache.org To: commits@brooklyn.incubator.apache.org Date: Sat, 07 Jun 2014 19:55:17 -0000 Message-Id: In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/3] git commit: Fix + test HA Manager inferring failed nodes X-Virus-Checked: Checked by ClamAV on apache.org Fix + test HA Manager inferring failed nodes Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/9bf833dd Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/9bf833dd Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/9bf833dd Branch: refs/heads/master Commit: 9bf833ddd5ed626f0d9a4326c747bc589f024cbd Parents: c1aa990 Author: Aled Sage Authored: Fri Jun 6 08:20:30 2014 +0200 Committer: Richard Downer Committed: Sat Jun 7 21:43:41 2014 +0200 ---------------------------------------------------------------------- .../ha/HighAvailabilityManagerImpl.java | 21 ++++++++-------- .../ha/HighAvailabilityManagerTest.java | 26 +++++++++++++++++++- 2 files changed, 35 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/9bf833dd/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java index e3bc6ae..891cb1a 100644 --- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java +++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java @@ -426,12 +426,12 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { } /** - * @param replaceLocalNodeWithCurrentRecord - if true, the record for this mgmt node will be replaced with the + * @param reportCleanedState - if true, the record for this mgmt node will be replaced with the * actual current status known in this JVM (may be more recent than what is on disk); * normally there is no reason to care because data is persisted to disk immediately * after any significant change, but for fringe cases this is perhaps more accurate (perhaps remove in time?) */ - protected ManagementPlaneSyncRecord loadManagementPlaneSyncRecord(boolean replaceLocalNodeWithCurrentRecord) { + protected ManagementPlaneSyncRecord loadManagementPlaneSyncRecord(boolean reportCleanedState) { if (disabled) { // if HA is disabled, then we are the only node - no persistence; just load a memento to describe this node Builder builder = ManagementPlaneSyncRecordImpl.builder() @@ -448,17 +448,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { try { ManagementPlaneSyncRecord result = persister.loadSyncRecord(); - // Detect AWOL nodes - result = ManagementPlaneSyncRecordImpl.builder() - .masterNodeId(result.getMasterNodeId()) - .nodes(Iterables.transform(result.getManagementNodes().values(), detectNodesGoneAwolFunction)) - .node(createManagementNodeSyncRecord()) - .build(); - - if (replaceLocalNodeWithCurrentRecord) { + if (reportCleanedState) { + // Report this nodes most recent state, and detect AWOL nodes Builder builder = ManagementPlaneSyncRecordImpl.builder() .masterNodeId(result.getMasterNodeId()) - .nodes(result.getManagementNodes().values()) + .nodes(Iterables.transform(result.getManagementNodes().values(), detectNodesGoneAwolFunction)) .node(createManagementNodeSyncRecord()); if (getNodeState() == ManagementNodeState.MASTER) { builder.masterNodeId(ownNodeId); @@ -494,11 +488,16 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { return ticker.read(); } + /** + * Infers the health of a node - if it last reported itself as healthy (standby or master), but we haven't heard + * from it in a long time then report that node as failed; otherwise report its health as-is. + */ private class DetectNodesGoneAwol implements Function { @Nullable @Override public ManagementNodeSyncRecord apply(@Nullable ManagementNodeSyncRecord input) { if (input == null) return null; + if (!(input.getStatus() == ManagementNodeState.STANDBY || input.getStatus() == ManagementNodeState.MASTER)) return input; if (isHeartbeatOk(input, currentTimeMillis())) return input; return BasicManagementNodeSyncRecord.builder() .from(input) http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/9bf833dd/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTest.java b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTest.java index 4373c75..45b6326 100644 --- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTest.java +++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTest.java @@ -2,6 +2,7 @@ package brooklyn.management.ha; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotEquals; import static org.testng.Assert.assertTrue; import java.util.List; @@ -135,12 +136,35 @@ public class HighAvailabilityManagerTest { testGetManagementPlaneStatus(); } + @Test + public void testGetManagementPlaneSyncStateInfersTimedOutNodeAsFailed() throws Exception { + persister.delta(ManagementPlaneSyncRecordDeltaImpl.builder() + .node(newManagerMemento(ownNodeId, ManagementNodeState.STANDBY, currentTimeMillis())) + .node(newManagerMemento("node1", ManagementNodeState.MASTER, currentTimeMillis())) + .setMaster("node1") + .build()); + + manager.start(HighAvailabilityMode.AUTO); + + ManagementPlaneSyncRecord state = manager.getManagementPlaneSyncState(); + assertEquals(state.getManagementNodes().get("node1").getStatus(), ManagementNodeState.MASTER); + assertEquals(state.getManagementNodes().get(ownNodeId).getStatus(), ManagementNodeState.STANDBY); + + // Simulate passage of time; ticker used by this HA-manager so it will "correctly" publish + // its own heartbeat with the new time; but node1's record is now out-of-date. + incrementClock(31, TimeUnit.SECONDS); + + ManagementPlaneSyncRecord state2 = manager.getManagementPlaneSyncState(); + assertEquals(state2.getManagementNodes().get("node1").getStatus(), ManagementNodeState.FAILED); + assertNotEquals(state.getManagementNodes().get(ownNodeId).getStatus(), ManagementNodeState.FAILED); + } + private long currentTimeMillis() { return ticker.read(); } private long incrementClock(long increment, TimeUnit unit) { - currentTime.addAndGet(unit.toNanos(increment)); + currentTime.addAndGet(unit.toMillis(increment)); return currentTimeMillis(); }