Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 121832004F1 for ; Wed, 30 Aug 2017 18:06:59 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 10533169627; Wed, 30 Aug 2017 16:06:59 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 60188169624 for ; Wed, 30 Aug 2017 18:06:57 +0200 (CEST) Received: (qmail 50859 invoked by uid 500); 30 Aug 2017 16:06:55 -0000 Mailing-List: contact commits-help@cloudstack.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cloudstack.apache.org Delivered-To: mailing list commits@cloudstack.apache.org Received: (qmail 50850 invoked by uid 99); 30 Aug 2017 16:06:55 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 30 Aug 2017 16:06:55 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id C822C81721; Wed, 30 Aug 2017 16:06:52 +0000 (UTC) Date: Wed, 30 Aug 2017 16:06:54 +0000 To: "commits@cloudstack.apache.org" Subject: [cloudstack] 03/04: CLOUDSTACK-9782: Improve scheduling of jobs MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit From: bhaisaab@apache.org Reply-To: "commits@cloudstack.apache.org" In-Reply-To: <150410921171.23522.3100231987103456492@gitbox.apache.org> References: <150410921171.23522.3100231987103456492@gitbox.apache.org> X-Git-Host: gitbox.apache.org X-Git-Repo: cloudstack X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Rev: d2c3408da786518440c3946d5a37de260c5e5641 X-Git-NotificationType: diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: 
auto-generated Message-Id: <20170830160652.C822C81721@gitbox.apache.org> archived-at: Wed, 30 Aug 2017 16:06:59 -0000 This is an automated email from the ASF dual-hosted git repository. bhaisaab pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/cloudstack.git commit d2c3408da786518440c3946d5a37de260c5e5641 Author: Rohit Yadav AuthorDate: Fri Aug 25 12:05:11 2017 +0200 CLOUDSTACK-9782: Improve scheduling of jobs - Removed three bg thread tasks, uses FSM event-trigger based scheduling - On successful recovery, kicks VM HA - Improves overall HA scheduling and task submission, lower DB access Signed-off-by: Rohit Yadav --- api/src/org/apache/cloudstack/ha/HAConfig.java | 4 + .../hypervisor/kvm/resource/KVMHAChecker.java | 2 +- .../apache/cloudstack/ha/SimulatorHAProvider.java | 3 + .../nested-cloudstack/pom.xml | 2 +- .../org/apache/cloudstack/ha/HAManagerImpl.java | 309 +++++++++------------ .../apache/cloudstack/ha/HAResourceCounter.java | 11 +- .../apache/cloudstack/ha/provider/HAProvider.java | 9 +- .../ha/provider/host/HAAbstractHostProvider.java | 16 +- .../cloudstack/ha/task/ActivityCheckTask.java | 25 +- .../org/apache/cloudstack/ha/task/BaseHATask.java | 23 +- .../org/apache/cloudstack/ha/task/FenceTask.java | 3 +- .../apache/cloudstack/ha/task/RecoveryTask.java | 13 +- .../OutOfBandManagementServiceImpl.java | 2 +- 13 files changed, 206 insertions(+), 216 deletions(-) diff --git a/api/src/org/apache/cloudstack/ha/HAConfig.java b/api/src/org/apache/cloudstack/ha/HAConfig.java index 36fe11c..95b5c9b 100644 --- a/api/src/org/apache/cloudstack/ha/HAConfig.java +++ b/api/src/org/apache/cloudstack/ha/HAConfig.java @@ -47,8 +47,10 @@ public interface HAConfig extends StateObject, InternalIdentit ActivityCheckFailureUnderThresholdRatio, PowerCycle, Recovered, + RetryRecovery, RecoveryWaitPeriodTimeout, RecoveryOperationThresholdExceeded, + RetryFencing, Fenced; public Long getServerId() { @@ -123,6 +125,7 @@ public interface HAConfig 
extends StateObject, InternalIdentit FSM.addTransition(Recovering, Event.Disabled, Disabled); FSM.addTransition(Recovering, Event.Ineligible, Ineligible); + FSM.addTransition(Recovering, Event.RetryRecovery, Recovering); FSM.addTransition(Recovering, Event.Recovered, Recovered); FSM.addTransition(Recovering, Event.RecoveryOperationThresholdExceeded, Fencing); @@ -132,6 +135,7 @@ public interface HAConfig extends StateObject, InternalIdentit FSM.addTransition(Fencing, Event.Disabled, Disabled); FSM.addTransition(Fencing, Event.Ineligible, Ineligible); + FSM.addTransition(Fencing, Event.RetryFencing, Fencing); FSM.addTransition(Fencing, Event.Fenced, Fenced); FSM.addTransition(Fenced, Event.Disabled, Disabled); diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java index c99670c..3905b1e 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java @@ -54,7 +54,7 @@ public class KVMHAChecker extends KVMHABase implements Callable { OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser(); String result = cmd.execute(parser); s_logger.debug("KVMHAChecker pool: " + pool._poolIp); - s_logger.debug("KVMHAChecker reture: " + result); + s_logger.debug("KVMHAChecker result: " + result); s_logger.debug("KVMHAChecker parser: " + parser.getLine()); if (result == null && parser.getLine().contains("> DEAD <")) { s_logger.debug("read heartbeat failed: "); diff --git a/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java b/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java index 3c3e92f..02f4e65 100644 --- a/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java +++ 
b/plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java @@ -72,6 +72,9 @@ public class SimulatorHAProvider extends HAAbstractHostProvider implements HAPro @Override public boolean isEligible(final Host host) { + if (host == null) { + return false; + } final SimulatorHAState haState = hostHAStateMap.get(host.getId()); return !isInMaintenanceMode(host) && !isDisabled(host) && haState != null && Hypervisor.HypervisorType.Simulator.equals(host.getHypervisorType()); diff --git a/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml b/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml index 34a631a..6759610 100644 --- a/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml +++ b/plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml @@ -40,7 +40,7 @@ br.com.autonomiccs apache-cloudstack-java-client - 1.0.4 + 1.0.5 diff --git a/server/src/org/apache/cloudstack/ha/HAManagerImpl.java b/server/src/org/apache/cloudstack/ha/HAManagerImpl.java index ad3438b..c2ba528 100644 --- a/server/src/org/apache/cloudstack/ha/HAManagerImpl.java +++ b/server/src/org/apache/cloudstack/ha/HAManagerImpl.java @@ -17,32 +17,20 @@ package org.apache.cloudstack.ha; -import com.cloud.cluster.ClusterManagerListener; -import com.cloud.cluster.ManagementServerHost; -import com.cloud.dc.ClusterDetailsDao; -import com.cloud.dc.ClusterDetailsVO; -import com.cloud.dc.DataCenter; -import com.cloud.dc.DataCenterDetailVO; -import com.cloud.dc.dao.DataCenterDetailsDao; -import com.cloud.domain.Domain; -import com.cloud.event.ActionEvent; -import com.cloud.event.ActionEventUtils; -import com.cloud.event.EventTypes; -import com.cloud.ha.Investigator; -import com.cloud.host.Host; -import com.cloud.host.Status; -import com.cloud.host.dao.HostDao; -import com.cloud.org.Cluster; -import com.cloud.utils.component.ComponentContext; -import com.cloud.utils.component.ManagerBase; -import com.cloud.utils.component.PluggableService; -import 
com.cloud.utils.db.Transaction; -import com.cloud.utils.db.TransactionCallback; -import com.cloud.utils.db.TransactionStatus; -import com.cloud.utils.exception.CloudRuntimeException; -import com.cloud.utils.fsm.NoTransitionException; -import com.google.common.base.Preconditions; -import com.google.common.base.Strings; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import javax.inject.Inject; +import javax.naming.ConfigurationException; + import org.apache.cloudstack.api.ApiErrorCode; import org.apache.cloudstack.api.ServerApiException; import org.apache.cloudstack.api.command.admin.ha.ConfigureHAForHostCmd; @@ -71,20 +59,36 @@ import org.apache.cloudstack.poll.BackgroundPollTask; import org.apache.cloudstack.utils.identity.ManagementServerNode; import org.apache.log4j.Logger; -import javax.inject.Inject; -import javax.naming.ConfigurationException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; +import com.cloud.cluster.ClusterManagerListener; +import com.cloud.cluster.ManagementServerHost; +import com.cloud.dc.ClusterDetailsDao; +import com.cloud.dc.ClusterDetailsVO; +import com.cloud.dc.DataCenter; +import com.cloud.dc.DataCenterDetailVO; +import com.cloud.dc.dao.DataCenterDetailsDao; +import com.cloud.domain.Domain; +import com.cloud.event.ActionEvent; +import com.cloud.event.ActionEventUtils; +import 
com.cloud.event.EventTypes; +import com.cloud.ha.Investigator; +import com.cloud.host.Host; +import com.cloud.host.Status; +import com.cloud.host.dao.HostDao; +import com.cloud.org.Cluster; +import com.cloud.utils.component.ComponentContext; +import com.cloud.utils.component.ManagerBase; +import com.cloud.utils.component.PluggableService; +import com.cloud.utils.db.Transaction; +import com.cloud.utils.db.TransactionCallback; +import com.cloud.utils.db.TransactionStatus; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.utils.fsm.NoTransitionException; +import com.cloud.utils.fsm.StateListener; +import com.cloud.utils.fsm.StateMachine2; +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; -public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable { +public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable, StateListener { public static final Logger LOG = Logger.getLogger(HAManagerImpl.class); @Inject @@ -307,7 +311,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust LOG.debug("HA: Agent is available/suspect/checking Up " + host.getId()); } return Status.Down; - } else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Recovered || haConfig.getState() == HAConfig.HAState.Fencing) { + } else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) { if (LOG.isDebugEnabled()){ LOG.debug("HA: Agent is disconnected " + host.getId()); } @@ -455,23 +459,84 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust return cmdList; } - ////////////////////////////////////////////////////////////////// - //////////////// 
Clustered Manager Listeners ///////////////////// - ////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////// + //////////////// Event Listeners ///////////////////// + ////////////////////////////////////////////////////// @Override public void onManagementNodeJoined(List nodeList, long selfNodeId) { - } @Override public void onManagementNodeLeft(List nodeList, long selfNodeId) { - } @Override public void onManagementNodeIsolated() { + } + + private boolean processHAStateChange(final HAConfig haConfig, final boolean status) { + if (!status || !checkHAOwnership(haConfig)) { + return false; + } + + final HAResource resource = validateAndFindHAResource(haConfig); + if (resource == null) { + return false; + } + + final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource); + if (haProvider == null) { + return false; + } + + final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); + + // Perform activity checks + if (haConfig.getState() == HAConfig.HAState.Checking) { + final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig, + HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp())); + activityCheckExecutor.submit(job); + } + + // Attempt recovery + if (haConfig.getState() == HAConfig.HAState.Recovering) { + if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) { + return false; + } + final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig, + HAProviderConfig.RecoveryTimeout, recoveryExecutor)); + final Future recoveryFuture = recoveryExecutor.submit(task); + counter.setRecoveryFuture(recoveryFuture); + } + + // Fencing + if (haConfig.getState() == HAConfig.HAState.Fencing) { + final FenceTask task = ComponentContext.inject(new FenceTask(resource, 
haProvider, haConfig, + HAProviderConfig.FenceTimeout, fenceExecutor)); + final Future fenceFuture = fenceExecutor.submit(task); + counter.setFenceFuture(fenceFuture); + } + return true; + } + + @Override + public boolean preStateTransitionEvent(final HAConfig.HAState oldState, final HAConfig.Event event, final HAConfig.HAState newState, final HAConfig haConfig, final boolean status, final Object opaque) { + if (oldState != newState || newState == HAConfig.HAState.Suspect || newState == HAConfig.HAState.Checking) { + return false; + } + if (LOG.isTraceEnabled()) { + LOG.trace("HA state pre-transition:: new state=" + newState + ", old state=" + oldState + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState()); + } + return processHAStateChange(haConfig, status); + } + @Override + public boolean postStateTransitionEvent(final StateMachine2.Transition transition, final HAConfig haConfig, final boolean status, final Object opaque) { + if (LOG.isTraceEnabled()) { + LOG.trace("HA state post-transition:: new state=" + transition.getToState() + ", old state=" + transition.getCurrentState() + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState()); + } + return processHAStateChange(haConfig, status); } /////////////////////////////////////////////////// @@ -523,10 +588,8 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust 0L, TimeUnit.MILLISECONDS, new ArrayBlockingQueue(fenceOperationQueueSize, true), new ThreadPoolExecutor.CallerRunsPolicy()); - pollManager.submitTask(new HealthCheckPollTask()); - pollManager.submitTask(new ActivityCheckPollTask()); - pollManager.submitTask(new RecoveryPollTask()); - pollManager.submitTask(new FencingPollTask()); + pollManager.submitTask(new HAManagerBgPollTask()); + HAConfig.HAState.getStateMachine().registerListener(this); LOG.debug("HA manager has been configured"); return true; @@ 
-559,7 +622,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust //////////////// Poll Tasks ///////////////////// ///////////////////////////////////////////////// - private final class HealthCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask { + private final class HAManagerBgPollTask extends ManagedContextRunnable implements BackgroundPollTask { @Override protected void runInContext() { try { @@ -582,20 +645,6 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust continue; } - final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); - - if (haConfig.getState() == HAConfig.HAState.Suspect) { - if (counter.canPerformActivityCheck((Long)(haProvider.getConfigValue(HAProviderConfig.MaxActivityCheckInterval, resource)))) { - transitionHAState(HAConfig.Event.PerformActivityCheck, haConfig); - } - } - - if (haConfig.getState() == HAConfig.HAState.Degraded) { - if (counter.canRecheckActivity((Long)(haProvider.getConfigValue(HAProviderConfig.MaxDegradedWaitTimeout, resource)))) { - transitionHAState(HAConfig.Event.PeriodicRecheckResourceActivity, haConfig); - } - } - switch (haConfig.getState()) { case Available: case Suspect: @@ -608,136 +657,44 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust default: break; } - } - } catch (Throwable t) { - LOG.error("Error trying to perform health checks in HA manager", t); - } - } - } - private final class ActivityCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask { - @Override - protected void runInContext() { - try { - if (LOG.isTraceEnabled()) { - LOG.trace("HA activity check task is running..."); - } - final List haConfigList = new ArrayList(haConfigDao.listAll()); - for (final HAConfig haConfig : haConfigList) { - if (!checkHAOwnership(haConfig)) { - continue; - } - - final HAResource resource = validateAndFindHAResource(haConfig); - if 
(resource == null) { - continue; - } - - final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource); - if (haProvider == null) { - continue; - } - - if (haConfig.getState() == HAConfig.HAState.Checking) { - final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); - final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig, - HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp())); - activityCheckExecutor.submit(job); - } - } - } catch (Throwable t) { - LOG.error("Error trying to perform activity checks in HA manager", t); - } - } - } - - private final class RecoveryPollTask extends ManagedContextRunnable implements BackgroundPollTask { - @Override - protected void runInContext() { - try { - if (LOG.isTraceEnabled()) { - LOG.trace("HA recovery task is running..."); - } - final List haConfigList = new ArrayList(haConfigDao.listAll()); - for (final HAConfig haConfig : haConfigList) { - if (!checkHAOwnership(haConfig)) { - continue; - } + final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); - final HAResource resource = validateAndFindHAResource(haConfig); - if (resource == null) { - continue; + if (haConfig.getState() == HAConfig.HAState.Suspect) { + if (counter.canPerformActivityCheck((Long)(haProvider.getConfigValue(HAProviderConfig.MaxActivityCheckInterval, resource)))) { + transitionHAState(HAConfig.Event.PerformActivityCheck, haConfig); + } } - final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource); - if (haProvider == null) { - continue; + if (haConfig.getState() == HAConfig.HAState.Degraded) { + if (counter.canRecheckActivity((Long)(haProvider.getConfigValue(HAProviderConfig.MaxDegradedWaitTimeout, resource)))) { + transitionHAState(HAConfig.Event.PeriodicRecheckResourceActivity, haConfig); + } } - final HAResourceCounter counter = 
getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); if (haConfig.getState() == HAConfig.HAState.Recovering) { - if (counter.canAttemptRecovery()) { - if (counter.getRecoveryCounter() >= (Long)(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) { - transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig); - continue; - } - - final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig, - HAProviderConfig.RecoveryTimeout, recoveryExecutor)); - final Future recoveryFuture = recoveryExecutor.submit(task); - counter.setRecoveryFuture(recoveryFuture); - counter.incrRecoveryCounter(); + if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) { + transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig); + } else { + transitionHAState(HAConfig.Event.RetryRecovery, haConfig); } } + if (haConfig.getState() == HAConfig.HAState.Recovered) { counter.markRecoveryStarted(); if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) { - transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig); - counter.markRecoveryCompleted(); + if (transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) { + counter.markRecoveryCompleted(); + } } } - } - } catch (Throwable t) { - LOG.error("Error trying to perform recovery operation in HA manager", t); - } - } - } - - private final class FencingPollTask extends ManagedContextRunnable implements BackgroundPollTask { - @Override - protected void runInContext() { - try { - if (LOG.isTraceEnabled()) { - LOG.trace("HA fencing task is running..."); - } - final List haConfigList = new ArrayList(haConfigDao.listAll()); - for (final HAConfig haConfig : haConfigList) { - if (!checkHAOwnership(haConfig)) { - continue; - } - final HAResource resource = validateAndFindHAResource(haConfig); - if 
(resource == null) { - continue; - } - - final HAProvider haProvider = validateAndFindHAProvider(haConfig, resource); - if (haProvider == null) { - continue; - } - - final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); - if (counter.lastFencingCompleted()) { - if (haConfig.getState() == HAConfig.HAState.Fencing) { - final FenceTask task = ComponentContext.inject(new FenceTask(resource, haProvider, haConfig, - HAProviderConfig.FenceTimeout, fenceExecutor)); - final Future fenceFuture = fenceExecutor.submit(task); - counter.setFenceFuture(fenceFuture); - } + if (haConfig.getState() == HAConfig.HAState.Fencing && counter.canAttemptFencing()) { + transitionHAState(HAConfig.Event.RetryFencing, haConfig); } } } catch (Throwable t) { - LOG.error("Error trying to perform fencing operation in HA manager", t); + LOG.error("Error trying to perform health checks in HA manager", t); } } } diff --git a/server/src/org/apache/cloudstack/ha/HAResourceCounter.java b/server/src/org/apache/cloudstack/ha/HAResourceCounter.java index f955fd2..f493f69 100644 --- a/server/src/org/apache/cloudstack/ha/HAResourceCounter.java +++ b/server/src/org/apache/cloudstack/ha/HAResourceCounter.java @@ -41,7 +41,6 @@ public final class HAResourceCounter { } public synchronized void incrActivityCounter(final boolean isFailure) { - lastActivityCheckTimestamp = System.currentTimeMillis(); activityCheckCounter.incrementAndGet(); if (isFailure) { activityCheckFailureCounter.incrementAndGet(); @@ -71,8 +70,12 @@ public final class HAResourceCounter { return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio); } - public boolean canPerformActivityCheck(final Long activityCheckInterval) { - return lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000); + public synchronized boolean canPerformActivityCheck(final Long activityCheckInterval) { + if 
(lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000)) { + lastActivityCheckTimestamp = System.currentTimeMillis(); + return true; + } + return false; } public boolean canRecheckActivity(final Long maxDegradedPeriod) { @@ -121,7 +124,7 @@ public final class HAResourceCounter { fenceFuture = future; } - public boolean lastFencingCompleted() { + public boolean canAttemptFencing() { return fenceFuture == null || fenceFuture.isDone(); } diff --git a/server/src/org/apache/cloudstack/ha/provider/HAProvider.java b/server/src/org/apache/cloudstack/ha/provider/HAProvider.java index bcc590c..9a7f27c 100644 --- a/server/src/org/apache/cloudstack/ha/provider/HAProvider.java +++ b/server/src/org/apache/cloudstack/ha/provider/HAProvider.java @@ -17,12 +17,11 @@ package org.apache.cloudstack.ha.provider; -import com.cloud.utils.component.Adapter; - import org.apache.cloudstack.ha.HAConfig; +import org.apache.cloudstack.ha.HAResource; import org.joda.time.DateTime; -import org.apache.cloudstack.ha.HAResource; +import com.cloud.utils.component.Adapter; public interface HAProvider extends Adapter { @@ -57,7 +56,9 @@ public interface HAProvider extends Adapter { boolean fence(R r) throws HAFenceException; - void setFenced(R r); + void fenceSubResources(R r); + + void enableMaintenance(R r); void sendAlert(R r, HAConfig.HAState nextState); diff --git a/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java b/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java index 43aa200..966c284 100644 --- a/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java +++ b/server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java @@ -71,7 +71,7 @@ public abstract class HAAbstractHostProvider extends AdapterBase implements HAPr } @Override - public void setFenced(final Host r) { + public void fenceSubResources(final Host r) { if 
(r.getState() != Status.Down) { try { LOG.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host id=" + r.getId()); @@ -80,11 +80,15 @@ public abstract class HAAbstractHostProvider extends AdapterBase implements HAPr } catch (Exception e) { LOG.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e); } - try { - resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId()); - } catch (NoTransitionException e) { - LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e); - } + } + } + + @Override + public void enableMaintenance(final Host r) { + try { + resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId()); + } catch (NoTransitionException e) { + LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e); } } diff --git a/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java b/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java index ab8af61..24f9696 100644 --- a/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java +++ b/server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java @@ -17,6 +17,10 @@ package org.apache.cloudstack.ha.task; +import java.util.concurrent.ExecutorService; + +import javax.inject.Inject; + import org.apache.cloudstack.ha.HAConfig; import org.apache.cloudstack.ha.HAManager; import org.apache.cloudstack.ha.HAResource; @@ -25,11 +29,7 @@ import org.apache.cloudstack.ha.provider.HACheckerException; import org.apache.cloudstack.ha.provider.HAProvider; import org.apache.cloudstack.ha.provider.HAProvider.HAProviderConfig; import org.apache.log4j.Logger; - -import javax.inject.Inject; - import org.joda.time.DateTime; -import java.util.concurrent.ExecutorService; public class ActivityCheckTask extends 
BaseHATask { @@ -38,22 +38,24 @@ public class ActivityCheckTask extends BaseHATask { @Inject private HAManager haManager; - private final long disconnectTime; + private long disconnectTime; + private long maxActivityChecks; + private double activityCheckFailureRatio; public ActivityCheckTask(final HAResource resource, final HAProvider haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig, final ExecutorService executor, final long disconnectTime) { super(resource, haProvider, haConfig, haProviderConfig, executor); this.disconnectTime = disconnectTime; + this.maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource); + this.activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource); } public boolean performAction() throws HACheckerException { return getHaProvider().hasActivity(getResource(), new DateTime(disconnectTime)); } - public void processResult(boolean result, Throwable t) { + public synchronized void processResult(boolean result, Throwable t) { final HAConfig haConfig = getHaConfig(); - final HAProvider haProvider = getHaProvider(); - final HAResource resource = getResource(); final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); if (t != null && t instanceof HACheckerException) { @@ -64,18 +66,17 @@ public class ActivityCheckTask extends BaseHATask { counter.incrActivityCounter(!result); - long maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource); if (counter.getActivityCheckCounter() < maxActivityChecks) { haManager.transitionHAState(HAConfig.Event.TooFewActivityCheckSamples, haConfig); return; } - double activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource); if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) { 
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig); } else { - haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig); - counter.markResourceDegraded(); + if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) { + counter.markResourceDegraded(); + } } counter.resetActivityCounter(); } diff --git a/server/src/org/apache/cloudstack/ha/task/BaseHATask.java b/server/src/org/apache/cloudstack/ha/task/BaseHATask.java index 3ed8738..9c87809 100644 --- a/server/src/org/apache/cloudstack/ha/task/BaseHATask.java +++ b/server/src/org/apache/cloudstack/ha/task/BaseHATask.java @@ -17,6 +17,13 @@ package org.apache.cloudstack.ha.task; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + import org.apache.cloudstack.ha.HAConfig; import org.apache.cloudstack.ha.HAResource; import org.apache.cloudstack.ha.provider.HACheckerException; @@ -24,13 +31,7 @@ import org.apache.cloudstack.ha.provider.HAFenceException; import org.apache.cloudstack.ha.provider.HAProvider; import org.apache.cloudstack.ha.provider.HARecoveryException; import org.apache.log4j.Logger; - -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; +import org.joda.time.DateTime; public abstract class BaseHATask implements Callable { public static final Logger LOG = Logger.getLogger(BaseHATask.class); @@ -40,6 +41,7 @@ public abstract class BaseHATask implements Callable { private final HAConfig haConfig; private final ExecutorService executor; private Long timeout; + private DateTime created; 
public BaseHATask(final HAResource resource, final HAProvider haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig, final ExecutorService executor) { @@ -48,6 +50,7 @@ public abstract class BaseHATask implements Callable { this.haConfig = haConfig; this.executor = executor; this.timeout = (Long)haProvider.getConfigValue(haProviderConfig, resource); + this.created = new DateTime(); } public HAProvider getHaProvider() { @@ -74,6 +77,9 @@ public abstract class BaseHATask implements Callable { @Override public Boolean call() { + if (new DateTime().minusHours(1).isAfter(getCreated())) { + return false; + } final Future future = executor.submit(new Callable() { @Override public Boolean call() throws HACheckerException, HAFenceException, HARecoveryException { @@ -99,4 +105,7 @@ public abstract class BaseHATask implements Callable { return result; } + public DateTime getCreated() { + return created; + } } diff --git a/server/src/org/apache/cloudstack/ha/task/FenceTask.java b/server/src/org/apache/cloudstack/ha/task/FenceTask.java index d9fd62c..700d6b8 100644 --- a/server/src/org/apache/cloudstack/ha/task/FenceTask.java +++ b/server/src/org/apache/cloudstack/ha/task/FenceTask.java @@ -48,7 +48,8 @@ public class FenceTask extends BaseHATask { if (result) { counter.resetRecoveryCounter(); haManager.transitionHAState(HAConfig.Event.Fenced, haConfig); - getHaProvider().setFenced(getResource()); + getHaProvider().fenceSubResources(getResource()); + getHaProvider().enableMaintenance(getResource()); } getHaProvider().sendAlert(getResource(), HAConfig.HAState.Fencing); } diff --git a/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java b/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java index b4eb863..446dd53 100644 --- a/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java +++ b/server/src/org/apache/cloudstack/ha/task/RecoveryTask.java @@ -17,16 +17,18 @@ package org.apache.cloudstack.ha.task; +import 
java.util.concurrent.ExecutorService; + +import javax.inject.Inject; + import org.apache.cloudstack.ha.HAConfig; import org.apache.cloudstack.ha.HAManager; import org.apache.cloudstack.ha.HAResource; +import org.apache.cloudstack.ha.HAResourceCounter; import org.apache.cloudstack.ha.provider.HACheckerException; import org.apache.cloudstack.ha.provider.HAProvider; import org.apache.cloudstack.ha.provider.HARecoveryException; -import javax.inject.Inject; -import java.util.concurrent.ExecutorService; - public class RecoveryTask extends BaseHATask { @Inject @@ -43,8 +45,13 @@ public class RecoveryTask extends BaseHATask { public void processResult(boolean result, Throwable e) { final HAConfig haConfig = getHaConfig(); + final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType()); + counter.incrRecoveryCounter(); + counter.resetActivityCounter(); + if (result) { haManager.transitionHAState(HAConfig.Event.Recovered, haConfig); + getHaProvider().fenceSubResources(getResource()); } getHaProvider().sendAlert(getResource(), HAConfig.HAState.Recovering); } diff --git a/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java b/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java index fe58c64..7b09d29 100644 --- a/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java +++ b/server/src/org/apache/cloudstack/outofbandmanagement/OutOfBandManagementServiceImpl.java @@ -267,7 +267,7 @@ public class OutOfBandManagementServiceImpl extends ManagerBase implements OutOf } public boolean isOutOfBandManagementEnabled(final Host host) { - return isOutOfBandManagementEnabledForZone(host.getDataCenterId()) + return host != null && isOutOfBandManagementEnabledForZone(host.getDataCenterId()) && isOutOfBandManagementEnabledForCluster(host.getClusterId()) && isOutOfBandManagementEnabledForHost(host.getId()); } -- To stop receiving notification 
emails like this one, please contact "commits@cloudstack.apache.org".