brooklyn-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From henev...@apache.org
Subject [03/15] git commit: add a hot_standby mode, now the default when running HA (although it doesn't do anything, it is a coherent state and the MasterChooser prefers it)
Date Wed, 01 Oct 2014 18:03:03 GMT
add a hot_standby mode, now the default when running HA (although it doesn't do anything, it
is a coherent state and the MasterChooser prefers it)


Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/c2b05ff2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/c2b05ff2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/c2b05ff2

Branch: refs/heads/master
Commit: c2b05ff274fbba81e9a2fc2544b2b6587d55afae
Parents: 3ef6e25
Author: Alex Heneveld <alex.heneveld@cloudsoftcorp.com>
Authored: Fri Sep 26 13:41:46 2014 +0100
Committer: Alex Heneveld <alex.heneveld@cloudsoftcorp.com>
Committed: Wed Oct 1 16:40:15 2014 +0100

----------------------------------------------------------------------
 .../management/ha/HighAvailabilityManager.java  |   3 +-
 .../management/ha/HighAvailabilityMode.java     |  12 +-
 .../management/ha/ManagementNodeState.java      |  14 +++
 .../management/ha/BasicMasterChooser.java       | 120 ++++++++++++++-----
 .../ha/HighAvailabilityManagerImpl.java         |  77 ++++++++----
 .../ha/HighAvailabilityManagerInMemoryTest.java |   4 +
 .../HighAvailabilityManagerSplitBrainTest.java  |  34 +++---
 .../ha/HighAvailabilityManagerTestFixture.java  |  10 +-
 .../management/ha/MasterChooserTest.java        |  37 +++++-
 usage/cli/src/main/java/brooklyn/cli/Main.java  |  12 +-
 .../brooklyn/launcher/BrooklynLauncher.java     |   1 +
 .../BrooklynLauncherHighAvailabilityTest.java   |  18 +++
 12 files changed, 260 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java b/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
index 937d25b..d9888f9 100644
--- a/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
+++ b/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
@@ -70,7 +70,8 @@ public interface HighAvailabilityManager {
      * Starts the monitoring of other nodes (and thus potential promotion of this node from
standby to master).
      * <p>
      * When this method returns, the status of this node will be set,
-     * either {@link ManagementNodeState#MASTER} if appropriate or {@link ManagementNodeState#STANDBY}.
+     * either {@link ManagementNodeState#MASTER} if appropriate 
+     * or {@link ManagementNodeState#STANDBY} / {@link ManagementNodeState#HOT_STANDBY}.
      *
      * @param startMode mode to start with
      * @throws IllegalStateException if current state of the management-plane doesn't match
that desired by {@code startMode} 

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/api/src/main/java/brooklyn/management/ha/HighAvailabilityMode.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/management/ha/HighAvailabilityMode.java b/api/src/main/java/brooklyn/management/ha/HighAvailabilityMode.java
index 9a154ca..29f8f7d 100644
--- a/api/src/main/java/brooklyn/management/ha/HighAvailabilityMode.java
+++ b/api/src/main/java/brooklyn/management/ha/HighAvailabilityMode.java
@@ -30,12 +30,22 @@ public enum HighAvailabilityMode {
     AUTO,
     
     /**
-     * Means node must be standby; if there is not already a master then fail fast on startup.

+     * Means node must be lukewarm standby; if there is not already a master then fail fast
on startup.
+     * See {@link ManagementNodeState#STANDBY}. 
      */
     STANDBY,
     
     /**
+     * Means node must be hot standby; if there is not already a master then fail fast on
startup.
+     * See {@link ManagementNodeState#HOT_STANDBY}. 
+     */
+    HOT_STANDBY,
+    
+    /**
      * Means node must be master; if there is already a master then fail fast on startup.
+     * See {@link ManagementNodeState#MASTER}.
      */
+    // TODO when multi-master supported we will of course not fail fast on startup when there
is already a master;
+    // instead the responsibility for master entities will be divided among masters
     MASTER;
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/api/src/main/java/brooklyn/management/ha/ManagementNodeState.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/management/ha/ManagementNodeState.java b/api/src/main/java/brooklyn/management/ha/ManagementNodeState.java
index 00a90a4..d3b0508 100644
--- a/api/src/main/java/brooklyn/management/ha/ManagementNodeState.java
+++ b/api/src/main/java/brooklyn/management/ha/ManagementNodeState.java
@@ -19,9 +19,23 @@
 package brooklyn.management.ha;
 
 public enum ManagementNodeState {
+    /** @deprecated since 0.7.0 synonym for {@link #INITIALIZING} (plus, it should have been US English!)
*/
     UNINITIALISED,
+    /** node is either coming online, or is in some kind of recovery/transitioning mode */
+    INITIALIZING,
+    
+    /** node is in "lukewarm standby" mode, where it is available to be promoted to master,
+     * but does not have entities loaded and will require some effort to be promoted */
     STANDBY,
+    /** node is acting as read-only proxy */
+    HOT_STANDBY,
+    /** node is running as primary/master, able to manage entities and create new ones */
+    // the semantics are intended to support multi-master here; we could have multiple master
nodes,
+    // but we need to look up who is master for any given entity
     MASTER,
+
+    /** node has failed and requires maintenance attention */
     FAILED,
+    /** node has gone away; maintenance not possible */
     TERMINATED;
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/main/java/brooklyn/management/ha/BasicMasterChooser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/BasicMasterChooser.java b/core/src/main/java/brooklyn/management/ha/BasicMasterChooser.java
index 2f82016..78c110a 100644
--- a/core/src/main/java/brooklyn/management/ha/BasicMasterChooser.java
+++ b/core/src/main/java/brooklyn/management/ha/BasicMasterChooser.java
@@ -18,18 +18,19 @@
  */
 package brooklyn.management.ha;
 
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
-import java.util.Map;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import brooklyn.entity.trait.Identifiable;
+import brooklyn.util.collections.MutableList;
 import brooklyn.util.time.Duration;
 
 import com.google.common.annotations.Beta;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import com.google.common.annotations.VisibleForTesting;
 
 /**
  * @since 0.7.0
@@ -41,33 +42,69 @@ public abstract class BasicMasterChooser implements MasterChooser {
 
     private static final Logger LOG = LoggerFactory.getLogger(BasicMasterChooser.class);
 
-    // advantage of this over taking most recent is that nodes will agree on this
-    // (where heartbeat timeout is reasonably large)
+    protected static class ScoredRecord<T extends Comparable<T>> implements Identifiable,
Comparable<ScoredRecord<T>> {
+        String id;
+        ManagementNodeSyncRecord record;
+        T score;
+        
+        @Override
+        public String getId() {
+            return id;
+        }
+
+        @Override
+        public int compareTo(ScoredRecord<T> o) {
+            return score.compareTo(o.score);
+        }
+    }
+    
+    public ManagementNodeSyncRecord choose(ManagementPlaneSyncRecord memento, Duration heartbeatTimeout,
String ownNodeId) {
+        if (LOG.isDebugEnabled()) LOG.debug("Choosing new master from "+memento.getManagementNodes());
+        ManagementNodeSyncRecord me = memento.getManagementNodes().get(ownNodeId);
+        if (me==null) {
+            LOG.warn("Management node details not known when choosing new master: "+memento+"
/ "+ownNodeId);
+            return null;
+        }
+        Long nowIsh = me.getRemoteTimestamp();
+        if (nowIsh==null) {
+            LOG.warn("Management node for self missing timestamp when choosing new master:
"+memento);
+            return null;
+        }
+        
+        List<ScoredRecord<?>> contenders = filterHealthy(memento, heartbeatTimeout,
nowIsh);
+        
+        if (!contenders.isEmpty()) {
+            return pick(contenders);
+        } else {
+            LOG.info("No valid management node found for choosing new master: contender="+memento.getManagementNodes());
+            return null;
+        }        
+    }
+
+    /** pick the best contender; argument guaranteed to be non-null and non-empty,
+     * filtered for health reasons */
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    protected ManagementNodeSyncRecord pick(List<ScoredRecord<?>> contenders)
{
+        ScoredRecord min = null;
+        for (ScoredRecord x: contenders) {
+            if (min==null || x.score.compareTo(min.score)<0) min = x;
+        }
+        return min.record;
+    }
+
     public static class AlphabeticMasterChooser extends BasicMasterChooser {
+        final boolean preferHot;
+        public AlphabeticMasterChooser(boolean preferHot) { this.preferHot = preferHot; }
+        public AlphabeticMasterChooser() { this.preferHot = true; }
         @Override
-        public ManagementNodeSyncRecord choose(ManagementPlaneSyncRecord memento, Duration
heartbeatTimeout, String ownNodeId) {
-            if (LOG.isDebugEnabled()) LOG.debug("Choosing new master from "+memento.getManagementNodes());
-            ManagementNodeSyncRecord me = memento.getManagementNodes().get(ownNodeId);
-            if (me==null) {
-                LOG.warn("Management node details not known when choosing new master: "+memento+"
/ "+ownNodeId);
-                return null;
-            }
-            Long nowIsh = me.getRemoteTimestamp();
-            if (nowIsh==null) {
-                LOG.warn("Management node for self missing timestamp when choosing new master:
"+memento);
-                return null;
-            }
-            
-            Map<String, ManagementNodeSyncRecord> contenders = filterHealthy(memento,
heartbeatTimeout, nowIsh);
-            
-            if (contenders.size() > 0) {
-                List<String> contenderIds = Lists.newArrayList(contenders.keySet());
-                Collections.sort(contenderIds);
-                return contenders.get(contenderIds.get(0));
-            } else {
-                LOG.info("No valid management node found for choosing new master: contender="+memento.getManagementNodes());
-                return null;
-            }
+        protected String score(ManagementNodeSyncRecord contender) {
+            if (!preferHot)
+                return contender.getNodeId();
+            // simple prefix with the rating
+            String state = (contender.getStatus()==ManagementNodeState.MASTER ? "1" :
+                contender.getStatus()==ManagementNodeState.HOT_STANDBY ? "2" :
+                contender.getStatus()==ManagementNodeState.STANDBY ? "3" : "9");
+            return state + ":" + contender.getNodeId();
         }
     }
     
@@ -75,11 +112,11 @@ public abstract class BasicMasterChooser implements MasterChooser {
      * Filters the {@link ManagementPlaneSyncRecord#getManagementNodes()} to only those in
an appropriate state, 
      * and with heartbeats that have not timed out.
      */
-    protected Map<String, ManagementNodeSyncRecord> filterHealthy(ManagementPlaneSyncRecord
memento, Duration heartbeatTimeout, long nowIsh) {
+    protected List<ScoredRecord<?>> filterHealthy(ManagementPlaneSyncRecord memento,
Duration heartbeatTimeout, long nowIsh) {
         long oldestAcceptableTimestamp = nowIsh - heartbeatTimeout.toMilliseconds();
-        Map<String, ManagementNodeSyncRecord> contenders = Maps.newLinkedHashMap();
+        List<ScoredRecord<?>> contenders = MutableList.of();
         for (ManagementNodeSyncRecord contender : memento.getManagementNodes().values())
{
-            boolean statusOk = (contender.getStatus() == ManagementNodeState.STANDBY || contender.getStatus()
== ManagementNodeState.MASTER);
+            boolean statusOk = (contender.getStatus() == ManagementNodeState.STANDBY || contender.getStatus()
== ManagementNodeState.HOT_STANDBY || contender.getStatus() == ManagementNodeState.MASTER);
             Long remoteTimestamp = contender.getRemoteTimestamp();
             boolean heartbeatOk;
             if (remoteTimestamp==null) {
@@ -92,10 +129,29 @@ public abstract class BasicMasterChooser implements MasterChooser {
                 heartbeatOk = remoteTimestamp >= oldestAcceptableTimestamp;
             }
             if (statusOk && heartbeatOk) {
-                contenders.put(contender.getNodeId(), contender);
+                contenders.add(newScoredRecord(contender));
             }
             if (LOG.isTraceEnabled()) LOG.trace("Filtering choices of new master: contender="+contender+";
statusOk="+statusOk+"; heartbeatOk="+heartbeatOk);
         }
         return contenders;
     }
+
+    @VisibleForTesting
+    protected List<ScoredRecord<?>> sort(List<ScoredRecord<?>> input)
{
+        ArrayList<ScoredRecord<?>> copy = new ArrayList<ScoredRecord<?>>(input);
+        Collections.sort(copy);
+        return copy;
+    }
+    
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    protected ScoredRecord<?> newScoredRecord(ManagementNodeSyncRecord contender) {
+        ScoredRecord r = new ScoredRecord();
+        r.id = contender.getNodeId();
+        r.record = contender;
+        r.score = score(contender);
+        return r;
+    }
+
+    protected abstract Comparable<?> score(ManagementNodeSyncRecord contender);
+    
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
index fdfa879..2d7bf9f 100644
--- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
+++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
@@ -123,7 +123,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
     private volatile Task<?> pollingTask;
     private volatile boolean disabled;
     private volatile boolean running;
-    private volatile ManagementNodeState nodeState = ManagementNodeState.UNINITIALISED;
+    private volatile ManagementNodeState nodeState = ManagementNodeState.INITIALIZING;
+    private volatile boolean nodeStateTransitionComplete = false;
 
     public HighAvailabilityManagerImpl(ManagementContextInternal managementContext) {
         this.managementContext = managementContext;
@@ -202,6 +203,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
     public void start(HighAvailabilityMode startMode) {
         ownNodeId = managementContext.getManagementNodeId();
         nodeState = ManagementNodeState.STANDBY;
+        nodeStateTransitionComplete = true;
         running = true;
         
         // TODO Small race in that we first check, and then we'll do checkMaster() on first
poll,
@@ -212,13 +214,13 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         case AUTO:
             // don't care; let's start and see if we promote ourselves
             publishAndCheck(true);
-            if (nodeState == ManagementNodeState.STANDBY) {
+            if (nodeState == ManagementNodeState.STANDBY || nodeState == ManagementNodeState.HOT_STANDBY)
{
                 String masterNodeId = getManagementPlaneSyncState().getMasterNodeId();
                 ManagementNodeSyncRecord masterNodeDetails = getManagementPlaneSyncState().getManagementNodes().get(masterNodeId);
-                LOG.info("Management node "+ownNodeId+" started as HA STANDBY autodetected,
master is "+masterNodeId+
+                LOG.info("Management node "+ownNodeId+" started as HA " + nodeState + " autodetected,
master is "+masterNodeId +
                     (masterNodeDetails==null || masterNodeDetails.getUri()==null ? " (no
url)" : " at "+masterNodeDetails.getUri()));
             } else {
-                LOG.info("Management node "+ownNodeId+" started as HA MASTER autodetected");
+                LOG.info("Management node "+ownNodeId+" starting as HA MASTER autodetected");
             }
             break;
         case MASTER:
@@ -230,9 +232,10 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
             }
             break;
         case STANDBY:
+        case HOT_STANDBY:
             if (existingMaster != null) {
                 publishAndCheck(true);
-                LOG.info("Management node "+ownNodeId+" started as HA STANDBY explicitly,
status "+nodeState);
+                LOG.info("Management node "+ownNodeId+" started as "+startMode+" explicitly,
status "+nodeState);
             } else {
                 throw new IllegalStateException("No existing master; cannot start as standby");
             }
@@ -241,6 +244,17 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
             throw new IllegalStateException("Unexpected high availability start-mode "+startMode+"
for "+this);
         }
         
+        if (nodeState==ManagementNodeState.STANDBY && startMode!=HighAvailabilityMode.STANDBY)
{
+            // if standby was not explicitly requested, we need to promote to hot standby
+            // TODO may want a flag which enables/disables hot standby; for now, always enabled
unless told to start in standby
+            nodeStateTransitionComplete = false;
+            // inform the world that we are transitioning (not eligible for promotion while
going in to hot standby)
+            publishHealth();
+            attemptHotStandby();
+            publishHealth();
+        }
+        
+        nodeStateTransitionComplete = true;
         registerPollTask();
     }
 
@@ -263,8 +277,23 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         }
     }
     
+    /** returns the node state this node is trying to be in */
+    public ManagementNodeState getTransitionTargetNodeState() {
+        return nodeState;
+    }
+    
+    @SuppressWarnings("deprecation")
     @Override
     public ManagementNodeState getNodeState() {
+        if (nodeState==ManagementNodeState.FAILED) return nodeState;
+        // if target is master then we claim already being master, to prevent other nodes
from taking it
+        // (we may fail subsequently of course)
+        if (nodeState==ManagementNodeState.MASTER) return nodeState;
+        
+        // for backwards compatibility; remove in 0.8.0
+        if (nodeState==ManagementNodeState.UNINITIALISED) return ManagementNodeState.INITIALIZING;
+        
+        if (!nodeStateTransitionComplete) return ManagementNodeState.INITIALIZING;
         return nodeState;
     }
 
@@ -366,12 +395,6 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento);
     }
     
-    protected ManagementNodeState toNodeStateForPersistence(ManagementNodeState nodeState)
{
-        // uninitialized is set as null - TODO confirm that's necessary; nicer if we don't
need this method at all
-        if (nodeState == ManagementNodeState.UNINITIALISED) return null;
-        return nodeState;
-    }
-    
     protected boolean isHeartbeatOk(ManagementNodeSyncRecord masterNode, ManagementNodeSyncRecord
meNode) {
         if (masterNode==null) return false;
         if (meNode==null) {
@@ -455,7 +478,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         
         if (demotingSelfInFavourOfOtherMaster) {
             LOG.debug("Master-change for this node only, demoting "+ownNodeRecord.toVerboseString()+"
in favour of official master "+newMasterNodeRecord.toVerboseString());
-            demoteToStandby();
+            demoteToStandby(true);
             return;
         }
         
@@ -520,22 +543,29 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
     
     protected void demoteToFailed() {
         nodeState = ManagementNodeState.FAILED;
-        onDemotion();
+        onDemotionStopTasks();
+        nodeStateTransitionComplete = true;
         publishDemotionFromMaster(true);
     }
 
-    protected void demoteToStandby() {
+    protected void demoteToStandby(boolean hot) {
         if (!running) {
             LOG.warn("Ignoring demote-from-master request, as HighAvailabilityManager is
no longer running");
             return;
         }
 
+        nodeStateTransitionComplete = false;
         nodeState = ManagementNodeState.STANDBY;
-        onDemotion();
+        onDemotionStopTasks();
+        if (hot) {
+            publishDemotionFromMaster(false);
+            attemptHotStandby();
+        }
+        nodeStateTransitionComplete = true;
         publishDemotionFromMaster(false);
     }
     
-    protected void onDemotion() {
+    protected void onDemotionStopTasks() {
         managementContext.getRebindManager().stop();
         for (Application app: managementContext.getApplications())
             Entities.unmanage(app);
@@ -569,6 +599,13 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
             LOG.info("Cancelled "+tasks+" tasks on demotion");
     }
 
+    /** starts hot standby, in foreground; the caller is responsible for publishing health
afterwards.
+     * @return whether hot standby was possible (if not, errors should be stored elsewhere)
*/
+    protected boolean attemptHotStandby() {
+        nodeState = ManagementNodeState.HOT_STANDBY;
+        return true;
+    }
+    
     /**
      * @param reportCleanedState - if true, the record for this mgmt node will be replaced
with the
      * actual current status known in this JVM (may be more recent than what is persisted);
@@ -580,7 +617,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
             // if HA is disabled, then we are the only node - no persistence; just load a
memento to describe this node
             Builder builder = ManagementPlaneSyncRecordImpl.builder()
                 .node(createManagementNodeSyncRecord(true));
-            if (getNodeState() == ManagementNodeState.MASTER) {
+            if (getTransitionTargetNodeState() == ManagementNodeState.MASTER) {
                 builder.masterNodeId(ownNodeId);
             }
             return builder.build();
@@ -610,7 +647,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
                         .masterNodeId(result.getMasterNodeId())
                         .nodes(allNodes);
                     builder.node(me);
-                    if (getNodeState() == ManagementNodeState.MASTER) {
+                    if (getTransitionTargetNodeState() == ManagementNodeState.MASTER) {
                         builder.masterNodeId(ownNodeId);
                     }
                     result = builder.build();
@@ -631,7 +668,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         brooklyn.entity.rebind.plane.dto.BasicManagementNodeSyncRecord.Builder builder =
BasicManagementNodeSyncRecord.builder()
                 .brooklynVersion(BrooklynVersion.get())
                 .nodeId(ownNodeId)
-                .status(toNodeStateForPersistence(getNodeState()))
+                .status(getNodeState())
                 .localTimestamp(timestamp)
                 .uri(managementContext.getManagementNodeUri().orNull());
         if (useLocalTimestampAsRemoteTimestamp)
@@ -664,7 +701,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager
{
         @Override
         public ManagementNodeSyncRecord apply(@Nullable ManagementNodeSyncRecord input) {
             if (input == null) return null;
-            if (!(input.getStatus() == ManagementNodeState.STANDBY || input.getStatus() ==
ManagementNodeState.MASTER)) return input;
+            if (!(input.getStatus() == ManagementNodeState.STANDBY || input.getStatus() ==
ManagementNodeState.HOT_STANDBY || input.getStatus() == ManagementNodeState.MASTER)) return
input;
             if (isHeartbeatOk(input, referenceNode)) return input;
             return BasicManagementNodeSyncRecord.builder()
                     .from(input)

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerInMemoryTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerInMemoryTest.java
b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerInMemoryTest.java
index 9166202..dfabd0b 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerInMemoryTest.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerInMemoryTest.java
@@ -29,5 +29,9 @@ public class HighAvailabilityManagerInMemoryTest extends HighAvailabilityManager
     protected PersistenceObjectStore newPersistenceObjectStore() {
         return new InMemoryObjectStore();
     }
+    
+    public void testGetManagementPlaneStatus() throws Exception {
+        super.testGetManagementPlaneStatus();
+    }
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
index f83816e..dbdf3d3 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
@@ -20,6 +20,7 @@ package brooklyn.management.ha;
 
 import static org.testng.Assert.assertEquals;
 
+import java.util.Collections;
 import java.util.Date;
 import java.util.List;
 import java.util.Map;
@@ -201,7 +202,7 @@ public class HighAvailabilityManagerSplitBrainTest {
         log.info(n2+" HA: "+memento2);
         assertEquals(memento2.getMasterNodeId(), n1.ownNodeId);
         assertEquals(memento2.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER);
-        assertEquals(memento2.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.STANDBY);
+        assertEquals(memento2.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.HOT_STANDBY);
         assertEquals(memento2.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(),
time0);
         assertEquals(memento2.getManagementNodes().get(n2.ownNodeId).getRemoteTimestamp(),
time0);
         
@@ -254,7 +255,7 @@ public class HighAvailabilityManagerSplitBrainTest {
         log.info(n1+" HA now: "+memento1b);
         
         // n1 comes back and demotes himself 
-        assertEquals(memento1b.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.STANDBY);
+        assertEquals(memento1b.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.HOT_STANDBY);
         assertEquals(memento1b.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.MASTER);
         assertEquals(memento1b.getMasterNodeId(), n2.ownNodeId);
         assertEquals(memento1b.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(),
time2);
@@ -263,7 +264,7 @@ public class HighAvailabilityManagerSplitBrainTest {
         // n2 now sees itself as master, with n1 in standby again
         ManagementPlaneSyncRecord memento2c = n2.ha.getManagementPlaneSyncState();
         log.info(n2+" HA now: "+memento2c);
-        assertEquals(memento2c.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.STANDBY);
+        assertEquals(memento2c.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.HOT_STANDBY);
         assertEquals(memento2c.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.MASTER);
         assertEquals(memento2c.getMasterNodeId(), n2.ownNodeId);
         assertEquals(memento2c.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(),
time2);
@@ -314,28 +315,33 @@ public class HighAvailabilityManagerSplitBrainTest {
             Asserts.succeedsEventually(new Runnable() {
                 @Override public void run() {
                     ManagementPlaneSyncRecord memento = nodes.get(0).ha.getManagementPlaneSyncState();
-                    int masters=0, standbys=0, savedMasters=0, savedStandbys=0;
+                    List<ManagementNodeState> counts = MutableList.of(), savedCounts
= MutableList.of();
                     for (HaMgmtNode n: nodes) {
-                        if (n.ha.getNodeState()==ManagementNodeState.MASTER) masters++;
-                        if (n.ha.getNodeState()==ManagementNodeState.STANDBY) standbys++;
+                        counts.add(n.ha.getNodeState());
                         ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId);
                         if (m!=null) {
-                            if (m.getStatus()==ManagementNodeState.MASTER) savedMasters++;
-                            if (m.getStatus()==ManagementNodeState.STANDBY) savedStandbys++;
+                            savedCounts.add(m.getStatus());
                         }
                     }
-                    log.info("while starting "+nodes.size()+" nodes: "+masters+" M + "+standbys+"
zzz; "
+                    log.info("while starting "+nodes.size()+" nodes: "
+                        +Collections.frequency(counts, ManagementNodeState.MASTER)+" M +
"
+                        +Collections.frequency(counts, ManagementNodeState.HOT_STANDBY)+"
hot + "
+                        +Collections.frequency(counts, ManagementNodeState.STANDBY)+" warm
+ "
+                        +Collections.frequency(counts, ManagementNodeState.INITIALIZING)+"
init; "
                         + memento.getManagementNodes().size()+" saved, "
-                        + memento.getMasterNodeId()+" master, "+savedMasters+" M + "+savedStandbys+"
zzz");
+                        +Collections.frequency(savedCounts, ManagementNodeState.MASTER)+"
M + "
+                        +Collections.frequency(savedCounts, ManagementNodeState.HOT_STANDBY)+"
hot + "
+                        +Collections.frequency(savedCounts, ManagementNodeState.STANDBY)+"
warm + "
+                        +Collections.frequency(savedCounts, ManagementNodeState.INITIALIZING)+"
init");
 
                     if (timer.isRunning() && Duration.of(timer).compareTo(Duration.TEN_SECONDS)>0)
{
                         log.warn("we seem to have a problem stabilizing");  //handy place
to set a suspend-VM breakpoint!
                         timer.stop();
                     }
-                    assertEquals(masters, 1);
-                    assertEquals(standbys, nodes.size()-1);
-                    assertEquals(savedMasters, 1);
-                    assertEquals(savedStandbys, nodes.size()-1);
+                    assertEquals(Collections.frequency(counts, ManagementNodeState.MASTER),
1);
+                    assertEquals(Collections.frequency(counts, ManagementNodeState.HOT_STANDBY),
nodes.size()-1);
+                    assertEquals(Collections.frequency(savedCounts, ManagementNodeState.MASTER),
1);
+                    assertEquals(Collections.frequency(savedCounts, ManagementNodeState.HOT_STANDBY),
nodes.size()-1);
                 }});
         } catch (Throwable t) {
             log.warn("Failed to stabilize (rethrowing): "+t, t);

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTestFixture.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTestFixture.java
b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTestFixture.java
index e0288e3..a657636 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTestFixture.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerTestFixture.java
@@ -99,7 +99,7 @@ public abstract class HighAvailabilityManagerTestFixture {
                 .setRemoteTicker(getRemoteTicker())
                 .setPersister(persister);
         persister.delta(ManagementPlaneSyncRecordDeltaImpl.builder()
-            .node(newManagerMemento(ownNodeId, ManagementNodeState.STANDBY))
+            .node(newManagerMemento(ownNodeId, ManagementNodeState.HOT_STANDBY))
             .build());
 
     }
@@ -142,7 +142,7 @@ public abstract class HighAvailabilityManagerTestFixture {
     // next poll fixes it.
     public void testPromotes() throws Exception {
         persister.delta(ManagementPlaneSyncRecordDeltaImpl.builder()
-                .node(newManagerMemento(ownNodeId, ManagementNodeState.STANDBY))
+                .node(newManagerMemento(ownNodeId, ManagementNodeState.HOT_STANDBY))
                 .node(newManagerMemento("node1", ManagementNodeState.MASTER))
                 .setMaster("node1")
                 .build());
@@ -160,7 +160,7 @@ public abstract class HighAvailabilityManagerTestFixture {
     @Test(groups="Integration") // because one second wait in succeedsContinually
     public void testDoesNotPromoteIfMasterTimeoutNotExpired() throws Exception {
         persister.delta(ManagementPlaneSyncRecordDeltaImpl.builder()
-                .node(newManagerMemento(ownNodeId, ManagementNodeState.STANDBY))
+                .node(newManagerMemento(ownNodeId, ManagementNodeState.HOT_STANDBY))
                 .node(newManagerMemento("node1", ManagementNodeState.MASTER))
                 .setMaster("node1")
                 .build());
@@ -211,7 +211,7 @@ public abstract class HighAvailabilityManagerTestFixture {
     @Test
     public void testGetManagementPlaneSyncStateInfersTimedOutNodeAsFailed() throws Exception
{
         persister.delta(ManagementPlaneSyncRecordDeltaImpl.builder()
-                .node(newManagerMemento(ownNodeId, ManagementNodeState.STANDBY))
+                .node(newManagerMemento(ownNodeId, ManagementNodeState.HOT_STANDBY))
                 .node(newManagerMemento("node1", ManagementNodeState.MASTER))
                 .setMaster("node1")
                 .build());
@@ -220,7 +220,7 @@ public abstract class HighAvailabilityManagerTestFixture {
         
         ManagementPlaneSyncRecord state = manager.getManagementPlaneSyncState();
         assertEquals(state.getManagementNodes().get("node1").getStatus(), ManagementNodeState.MASTER);
-        assertEquals(state.getManagementNodes().get(ownNodeId).getStatus(), ManagementNodeState.STANDBY);
+        assertEquals(state.getManagementNodes().get(ownNodeId).getStatus(), ManagementNodeState.HOT_STANDBY);
         
         // Simulate passage of time; ticker used by this HA-manager so it will "correctly" publish
         // its own heartbeat with the new time; but node1's record is now out-of-date.

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/core/src/test/java/brooklyn/management/ha/MasterChooserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/MasterChooserTest.java b/core/src/test/java/brooklyn/management/ha/MasterChooserTest.java
index 23acc5c..6f8ba48 100644
--- a/core/src/test/java/brooklyn/management/ha/MasterChooserTest.java
+++ b/core/src/test/java/brooklyn/management/ha/MasterChooserTest.java
@@ -21,15 +21,20 @@ package brooklyn.management.ha;
 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertNull;
 
+import java.util.List;
+
 import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.Test;
 
 import brooklyn.BrooklynVersion;
+import brooklyn.entity.basic.EntityFunctions;
 import brooklyn.entity.rebind.plane.dto.BasicManagementNodeSyncRecord;
 import brooklyn.management.ha.BasicMasterChooser.AlphabeticMasterChooser;
+import brooklyn.management.ha.BasicMasterChooser.ScoredRecord;
 import brooklyn.util.time.Duration;
 
-import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
 
 public class MasterChooserTest {
 
@@ -67,18 +72,42 @@ public class MasterChooserTest {
         memento.addNode(newManagerMemento("node2", ManagementNodeState.STANDBY, now - 20*1000));
         memento.addNode(newManagerMemento("node3", ManagementNodeState.STANDBY, now));
         Duration heartbeatTimeout = Duration.THIRTY_SECONDS;
-        assertEquals(chooser.filterHealthy(memento, heartbeatTimeout, now).keySet(), ImmutableSet.of("node2",
"node3"));
+        assertEquals(getIds(chooser.sort(chooser.filterHealthy(memento, heartbeatTimeout,
now))), ImmutableList.of("node2", "node3"));
+    }
+    
+    protected static List<String> getIds(List<ScoredRecord<?>> filterHealthy)
{
+        return ImmutableList.copyOf(Iterables.transform(filterHealthy, EntityFunctions.id()));
+    }
+
+    @Test
+    public void testFiltersOutByStatusNotPreferringMaster() throws Exception {
+        assertEquals(doTestFiltersOutByStatus(false, false), ImmutableList.of("node4", "node5"));
+    }
+    @Test
+    public void testFiltersOutByStatusPreferringMaster() throws Exception {
+        assertEquals(doTestFiltersOutByStatus(true, false), ImmutableList.of("node5", "node4"));
     }
     
     @Test
-    public void testFiltersOutByStatus() throws Exception {
+    public void testFiltersOutByStatusNotPreferringHot() throws Exception {
+        assertEquals(doTestFiltersOutByStatus(false, true), ImmutableList.of("node4", "node5",
"node6"));
+    }
+    @Test
+    public void testFiltersOutByStatusPreferringHot() throws Exception {
+        assertEquals(doTestFiltersOutByStatus(true, true), ImmutableList.of("node5", "node6",
"node4"));
+    }
+    
+    protected List<String> doTestFiltersOutByStatus(boolean preferHot, boolean includeHot)
throws Exception {
+        chooser = new AlphabeticMasterChooser(preferHot);
         memento.addNode(newManagerMemento("node1", ManagementNodeState.FAILED, now));
         memento.addNode(newManagerMemento("node2", ManagementNodeState.TERMINATED, now));
         memento.addNode(newManagerMemento("node3", null, now));
         memento.addNode(newManagerMemento("node4",  ManagementNodeState.STANDBY, now));
         memento.addNode(newManagerMemento("node5", ManagementNodeState.MASTER, now));
+        if (includeHot)
+            memento.addNode(newManagerMemento("node6",  ManagementNodeState.HOT_STANDBY,
now));
         Duration heartbeatTimeout = Duration.THIRTY_SECONDS;
-        assertEquals(chooser.filterHealthy(memento, heartbeatTimeout, now).keySet(), ImmutableSet.of("node4",
"node5"));
+        return getIds(chooser.sort(chooser.filterHealthy(memento, heartbeatTimeout, now)));
     }
     
     private ManagementNodeSyncRecord newManagerMemento(String nodeId, ManagementNodeState
status, long timestamp) {

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/usage/cli/src/main/java/brooklyn/cli/Main.java
----------------------------------------------------------------------
diff --git a/usage/cli/src/main/java/brooklyn/cli/Main.java b/usage/cli/src/main/java/brooklyn/cli/Main.java
index 093f307..681e7a0 100644
--- a/usage/cli/src/main/java/brooklyn/cli/Main.java
+++ b/usage/cli/src/main/java/brooklyn/cli/Main.java
@@ -298,16 +298,18 @@ public class Main extends AbstractMain {
         protected final static String HA_OPTION_AUTO = "auto";
         protected final static String HA_OPTION_MASTER = "master";
         protected final static String HA_OPTION_STANDBY = "standby";
-        static { Enums.checkAllEnumeratedIgnoreCase(HighAvailabilityMode.class, HA_OPTION_AUTO,
HA_OPTION_DISABLED, HA_OPTION_MASTER, HA_OPTION_STANDBY); }
+        protected final static String HA_OPTION_HOT_STANDBY = "hot_standby";
+        static { Enums.checkAllEnumeratedIgnoreCase(HighAvailabilityMode.class, HA_OPTION_AUTO,
HA_OPTION_DISABLED, HA_OPTION_MASTER, HA_OPTION_STANDBY, HA_OPTION_HOT_STANDBY); }
         
-        @Option(name = { HA_OPTION }, allowedValues = { HA_OPTION_DISABLED, HA_OPTION_AUTO,
HA_OPTION_MASTER, HA_OPTION_STANDBY },
+        @Option(name = { HA_OPTION }, allowedValues = { HA_OPTION_DISABLED, HA_OPTION_AUTO,
HA_OPTION_MASTER, HA_OPTION_STANDBY, HA_OPTION_HOT_STANDBY },
                 title = "high availability mode",
                 description =
                         "The high availability mode. Possible values are: \n"+
                         "disabled: management node works in isolation - will not cooperate
with any other standby/master nodes in management plane; \n"+
                         "auto: will look for other management nodes, and will allocate itself
as standby or master based on other nodes' states; \n"+
                         "master: will startup as master - if there is already a master then
fails immediately; \n"+
-                        "standby: will start up as standby - if there is not already a master
then fails immediately")
+                        "standby: will start up as lukewarm standby - if there is not already
a master then fails immediately; \n"+
+                        "hot_standby: will start up as hot standby - if there is not already
a master then fails immediately")
         public String highAvailability = HA_OPTION_AUTO;
 
         @VisibleForTesting
@@ -456,8 +458,8 @@ public class Main extends AbstractMain {
                     if (highAvailabilityMode.get() == HighAvailabilityMode.AUTO)
                         return HighAvailabilityMode.DISABLED;
                     throw new FatalConfigurationRuntimeException("Cannot specify highAvailability
when persistence is disabled");
-                } else if (persistMode == PersistMode.CLEAN && highAvailabilityMode.get()
== HighAvailabilityMode.STANDBY) {
-                    throw new FatalConfigurationRuntimeException("Cannot specify highAvailability
STANDBY when persistence is CLEAN");
+                } else if (persistMode == PersistMode.CLEAN && (highAvailabilityMode.get()
== HighAvailabilityMode.STANDBY || highAvailabilityMode.get() == HighAvailabilityMode.HOT_STANDBY))
{
+                    throw new FatalConfigurationRuntimeException("Cannot specify highAvailability
"+highAvailabilityMode.get()+" when persistence is CLEAN");
                 }
             }
             return highAvailabilityMode.get();

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/usage/launcher/src/main/java/brooklyn/launcher/BrooklynLauncher.java
----------------------------------------------------------------------
diff --git a/usage/launcher/src/main/java/brooklyn/launcher/BrooklynLauncher.java b/usage/launcher/src/main/java/brooklyn/launcher/BrooklynLauncher.java
index dd5db0e..47d80a4 100644
--- a/usage/launcher/src/main/java/brooklyn/launcher/BrooklynLauncher.java
+++ b/usage/launcher/src/main/java/brooklyn/launcher/BrooklynLauncher.java
@@ -758,6 +758,7 @@ public class BrooklynLauncher {
                 case AUTO:
                 case MASTER:
                 case STANDBY:
+                case HOT_STANDBY:
                     startMode = highAvailabilityMode;
                     break;
                 case DISABLED:

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c2b05ff2/usage/launcher/src/test/java/brooklyn/launcher/BrooklynLauncherHighAvailabilityTest.java
----------------------------------------------------------------------
diff --git a/usage/launcher/src/test/java/brooklyn/launcher/BrooklynLauncherHighAvailabilityTest.java b/usage/launcher/src/test/java/brooklyn/launcher/BrooklynLauncherHighAvailabilityTest.java
index 949638f..419124d 100644
--- a/usage/launcher/src/test/java/brooklyn/launcher/BrooklynLauncherHighAvailabilityTest.java
+++ b/usage/launcher/src/test/java/brooklyn/launcher/BrooklynLauncherHighAvailabilityTest.java
@@ -216,6 +216,24 @@ public class BrooklynLauncherHighAvailabilityTest {
         }
     }
     
+    @Test
+    public void testHighAvailabilityHotStandbyModeFailsIfNoExistingMaster() throws Exception
{
+        try {
+            primary = BrooklynLauncher.newInstance();
+            primary.webconsole(false)
+                    .brooklynProperties(LocalManagementContextForTests.setEmptyCatalogAsDefault(BrooklynProperties.Factory.newEmpty()))
+                    .highAvailabilityMode(HighAvailabilityMode.HOT_STANDBY)
+                    .persistMode(PersistMode.AUTO)
+                    .persistenceDir(persistenceDir)
+                    .persistPeriod(Duration.millis(10))
+                    .application(EntitySpec.create(TestApplication.class))
+                    .start();
+            fail();
+        } catch (IllegalStateException e) {
+            // success
+        }
+    }
+    
     private void assertOnlyApp(ManagementContext managementContext, Class<? extends Application>
expectedType) {
         assertEquals(managementContext.getApplications().size(), 1, "apps="+managementContext.getApplications());
         assertNotNull(Iterables.find(managementContext.getApplications(), Predicates.instanceOf(TestApplication.class),
null), "apps="+managementContext.getApplications());


Mime
View raw message