cloudstack-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wid...@apache.org
Subject [41/50] [abbrv] git commit: updated refs/heads/reporter to 3e1816d
Date Mon, 27 Jul 2015 08:43:29 GMT
CLOUDSTACK-8666: Put host in Alert state only after alert.wait timeout
Instead of putting the host into the Alert state immediately, the investigators should be
allowed to run for some time, based on the alert.wait global config.
If the host state still cannot be determined at the end of this interval, then put the host
in the Alert state. Also updated some of the log messages.

This closes #621


Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/090db058
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/090db058
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/090db058

Branch: refs/heads/reporter
Commit: 090db05821a100ead24dee90658d5b0a863a8682
Parents: 7febdb5
Author: Koushik Das <koushik@apache.org>
Authored: Thu Jul 23 17:57:51 2015 +0530
Committer: Koushik Das <koushik@apache.org>
Committed: Fri Jul 24 12:16:07 2015 +0530

----------------------------------------------------------------------
 .../cloud/agent/manager/AgentManagerImpl.java   | 32 +++++++++++---------
 .../cloud/ha/HighAvailabilityManagerImpl.java   |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cloudstack/blob/090db058/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java
----------------------------------------------------------------------
diff --git a/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java
index 64aaf48..a38fd08 100644
--- a/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java
+++ b/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java
@@ -824,25 +824,30 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager,
Handl
                 /* OK, we are going to the bad status, let's see what happened */
                 s_logger.info("Investigating why host " + hostId + " has disconnected with
event " + event);
 
-                final Status determinedState = investigate(attache);
+                Status determinedState = investigate(attache);
                 // if state cannot be determined do nothing and bail out
                 if (determinedState == null) {
-                    s_logger.warn("Agent state cannot be determined, do nothing");
-                    return false;
+                    if (((System.currentTimeMillis() >> 10) - host.getLastPinged())
> AlertWait.value()) {
+                        s_logger.warn("Agent " + hostId + " state cannot be determined for
more than " + AlertWait + "(" + AlertWait.value() + ") seconds, will go to Alert state");
+                        determinedState = Status.Alert;
+                    } else {
+                        s_logger.warn("Agent " + hostId + " state cannot be determined, do
nothing");
+                        return false;
+                    }
                 }
 
                 final Status currentStatus = host.getStatus();
-                s_logger.info("The state determined is " + determinedState);
+                s_logger.info("The agent " + hostId + " state determined is " + determinedState);
 
                 if (determinedState == Status.Down) {
-                    s_logger.error("Host is down: " + host.getId() + "-" + host.getName()
+ ".  Starting HA on the VMs");
+                    String message = "Host is down: " + host.getId() + "-" + host.getName()
+ ". Starting HA on the VMs";
+                    s_logger.error(message);
                     if (host.getType() != Host.Type.SecondaryStorage && host.getType()
!= Host.Type.ConsoleProxy) {
-                        _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(),
host.getPodId(), "Host disconnected, " + host.getId(),
-                                "Host is down: " + host.getId() + "-" + host.getName() +
".  Starting HA on the VMs");
+                        _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(),
host.getPodId(), "Host down, " + host.getId(), message);
                     }
                     event = Status.Event.HostDown;
                 } else if (determinedState == Status.Up) {
-                    /* Got ping response from host, bring it back*/
+                    /* Got ping response from host, bring it back */
                     s_logger.info("Agent is determined to be up and running");
                     agentStatusTransitTo(host, Status.Event.Ping, _nodeId);
                     return false;
@@ -850,10 +855,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager,
Handl
                     s_logger.warn("Agent is disconnected but the host is still up: " + host.getId()
+ "-" + host.getName());
                     if (currentStatus == Status.Disconnected) {
                         if ((System.currentTimeMillis() >> 10) - host.getLastPinged()
> AlertWait.value()) {
-                            s_logger.warn("Host " + host.getId() + " has been disconnected
pass the time it should be disconnected.");
+                            s_logger.warn("Host " + host.getId() + " has been disconnected
past the wait time it should be disconnected.");
                             event = Status.Event.WaitedTooLong;
                         } else {
-                            s_logger.debug("Host has been determined to be disconnected but
it hasn't passed the wait time yet.");
+                            s_logger.debug("Host " + host.getId() + " has been determined
to be disconnected but it hasn't passed the wait time yet.");
                             return false;
                         }
                     } else if (currentStatus == Status.Up) {
@@ -862,7 +867,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager,
Handl
                         final String hostDesc = "name: " + host.getName() + " (id:" + host.getId()
+ "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
                         if (host.getType() != Host.Type.SecondaryStorage && host.getType()
!= Host.Type.ConsoleProxy) {
                             _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(),
host.getPodId(), "Host disconnected, " + hostDesc,
-                                    "If the agent for host [" + hostDesc + "] is not restarted
within " + AlertWait + " seconds, HA will begin on the VMs");
+                                    "If the agent for host [" + hostDesc + "] is not restarted
within " + AlertWait + " seconds, host will go to Alert state");
                         }
                         event = Status.Event.AgentDisconnected;
                     }
@@ -872,11 +877,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager,
Handl
                     final HostPodVO podVO = _podDao.findById(host.getPodId());
                     final String hostDesc = "name: " + host.getName() + " (id:" + host.getId()
+ "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
                     _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(),
host.getPodId(), "Host in ALERT state, " + hostDesc,
-                            "In availability zone " + host.getDataCenterId() + ", " + host.getId()
+ "-" + host.getName()
-                            + " disconnect due to event " + event + ", ms can't determine
the host status" );
+                            "In availability zone " + host.getDataCenterId() + ", host is
in alert state: " + host.getId() + "-" + host.getName());
                 }
             } else {
-                s_logger.debug("The next status of Agent " + host.getId() + " is not Alert,
no need to investigate what happened");
+                s_logger.debug("The next status of agent " + host.getId() + " is not Alert,
no need to investigate what happened");
             }
         }
         handleDisconnectWithoutInvestigation(attache, event, true, true);

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/090db058/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java
index 1ecdfcd..0a179a0 100644
--- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java
+++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java
@@ -220,7 +220,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements
HighAvai
             }
         }
 
-        return Status.Alert;
+        return hostState;
     }
 
     @Override


Mime
View raw message