cloudstack-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kous...@apache.org
Subject git commit: updated refs/heads/4.2 to a81cc8a
Date Wed, 10 Jul 2013 06:53:24 GMT
Updated Branches:
  refs/heads/4.2 be6bc2b68 -> a81cc8a12


CLOUDSTACK-2918: In a scaled-up environment, hosts fail to come up after a management server
restart in a clustered setup
Summary of changes in the fix
- Optimized the host scan logic: instead of iterating over each cluster individually, the host
scan is now done for a batch of clusters
- Made host scan task interval configurable


Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/a81cc8a1
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/a81cc8a1
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/a81cc8a1

Branch: refs/heads/4.2
Commit: a81cc8a12d58a4d30a8063b9518a7a97707035a9
Parents: be6bc2b
Author: Koushik Das <koushik.das@citrix.com>
Authored: Wed Jul 10 12:13:24 2013 +0530
Committer: Koushik Das <koushik.das@citrix.com>
Committed: Wed Jul 10 12:22:39 2013 +0530

----------------------------------------------------------------------
 .../src/com/cloud/host/dao/HostDaoImpl.java     | 141 ++++++++++++++-----
 .../manager/ClusteredAgentManagerImpl.java      |   9 +-
 server/src/com/cloud/configuration/Config.java  |   1 +
 setup/db/db/schema-410to420.sql                 |   2 +
 4 files changed, 117 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cloudstack/blob/a81cc8a1/engine/schema/src/com/cloud/host/dao/HostDaoImpl.java
----------------------------------------------------------------------
diff --git a/engine/schema/src/com/cloud/host/dao/HostDaoImpl.java b/engine/schema/src/com/cloud/host/dao/HostDaoImpl.java
index 810b973..a84527e 100755
--- a/engine/schema/src/com/cloud/host/dao/HostDaoImpl.java
+++ b/engine/schema/src/com/cloud/host/dao/HostDaoImpl.java
@@ -21,6 +21,7 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.TimeZone;
@@ -116,6 +117,7 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements
HostDao
 
     protected SearchBuilder<HostVO> HostsForReconnectSearch;
     protected GenericSearchBuilder<HostVO, Long> ClustersOwnedByMSSearch;
+    protected GenericSearchBuilder<HostVO, Long> ClustersForHostsNotOwnedByAnyMSSearch;
     protected GenericSearchBuilder<ClusterVO, Long> AllClustersSearch;
     protected SearchBuilder<HostVO> HostsInClusterSearch;
     
@@ -264,7 +266,7 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements
HostDao
         UnmanagedDirectConnectSearch.and("server", UnmanagedDirectConnectSearch.entity().getManagementServerId(),
SearchCriteria.Op.NULL);
         UnmanagedDirectConnectSearch.and("lastPinged", UnmanagedDirectConnectSearch.entity().getLastPinged(),
SearchCriteria.Op.LTEQ);
         UnmanagedDirectConnectSearch.and("resourceStates", UnmanagedDirectConnectSearch.entity().getResourceState(),
SearchCriteria.Op.NIN);
-        UnmanagedDirectConnectSearch.and("cluster", UnmanagedDirectConnectSearch.entity().getClusterId(),
SearchCriteria.Op.EQ);
+        UnmanagedDirectConnectSearch.and("clusterIn", UnmanagedDirectConnectSearch.entity().getClusterId(),
SearchCriteria.Op.IN);
         /*
          * UnmanagedDirectConnectSearch.op(SearchCriteria.Op.OR, "managementServerId",
          * UnmanagedDirectConnectSearch.entity().getManagementServerId(), SearchCriteria.Op.EQ);
@@ -353,6 +355,13 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements
HostDao
         ClustersOwnedByMSSearch.and("server", ClustersOwnedByMSSearch.entity().getManagementServerId(),
SearchCriteria.Op.EQ);
         ClustersOwnedByMSSearch.done();
 
+        ClustersForHostsNotOwnedByAnyMSSearch = createSearchBuilder(Long.class);
+        ClustersForHostsNotOwnedByAnyMSSearch.select(null, Func.DISTINCT, ClustersForHostsNotOwnedByAnyMSSearch.entity().getClusterId());
+        ClustersForHostsNotOwnedByAnyMSSearch.and("resource", ClustersForHostsNotOwnedByAnyMSSearch.entity().getResource(),
SearchCriteria.Op.NNULL);
+        ClustersForHostsNotOwnedByAnyMSSearch.and("cluster", ClustersForHostsNotOwnedByAnyMSSearch.entity().getClusterId(),
SearchCriteria.Op.NNULL);
+        ClustersForHostsNotOwnedByAnyMSSearch.and("server", ClustersForHostsNotOwnedByAnyMSSearch.entity().getManagementServerId(),
SearchCriteria.Op.NULL);
+        ClustersForHostsNotOwnedByAnyMSSearch.done();
+
         AllClustersSearch = _clusterDao.createSearchBuilder(Long.class);
         AllClustersSearch.select(null, Func.NATIVE, AllClustersSearch.entity().getId());
         AllClustersSearch.and("managed", AllClustersSearch.entity().getManagedState(), SearchCriteria.Op.EQ);
@@ -409,10 +418,17 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long>
implements HostDao
         sc.setParameters("lastPinged", lastPingSecondsAfter);
         sc.setParameters("status", Status.Disconnected, Status.Down, Status.Alert);
 
+        StringBuilder sb = new StringBuilder();
         List<HostVO> hosts = lockRows(sc, null, true); // exclusive lock
         for (HostVO host : hosts) {
             host.setManagementServerId(null);
             update(host.getId(), host);
+            sb.append(host.getId());
+            sb.append(" ");
+        }
+
+        if (s_logger.isTraceEnabled()) {
+            s_logger.trace("Following hosts got reset: " + sb.toString());
         }
     }
 
@@ -428,6 +444,16 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements
HostDao
     }
 
     /*
+     * Returns clusters based on the list of hosts not owned by any MS
+     */
+    private List<Long> findClustersForHostsNotOwnedByAnyManagementServer() {
+        SearchCriteria<Long> sc = ClustersForHostsNotOwnedByAnyMSSearch.create();
+
+        List<Long> clusters = customSearch(sc, null);
+        return clusters;
+    }
+
+    /*
      * Returns a list of all cluster Ids
      */
     private List<Long> listAllClusters() {
@@ -459,55 +485,100 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long>
implements HostDao
     public List<HostVO> findAndUpdateDirectAgentToLoad(long lastPingSecondsAfter, Long
limit, long managementServerId) {
         Transaction txn = Transaction.currentTxn();
 
-        // reset hosts that are suitable candidates for reconnect
         txn.start();
+        if (s_logger.isDebugEnabled()) {
+            s_logger.debug("Resetting hosts suitable for reconnect");
+        }
+        // reset hosts that are suitable candidates for reconnect
         resetHosts(managementServerId, lastPingSecondsAfter);
-        txn.commit();
-
-        List<Long> clusters = findClustersOwnedByManagementServer(managementServerId);
-        List<Long> allClusters = listAllClusters();
+        if (s_logger.isDebugEnabled()) {
+            s_logger.debug("Completed resetting hosts suitable for reconnect");
+        }
 
-        SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
-        sc.setParameters("lastPinged", lastPingSecondsAfter);
-        sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
         List<HostVO> assignedHosts = new ArrayList<HostVO>();
-        List<Long> remainingClusters = new ArrayList<Long>();
 
-        // handle clusters already owned by @managementServerId
-        txn.start();
-        for (Long clusterId : allClusters) {
-            if (clusters.contains(clusterId)) { // host belongs to clusters owned by @managementServerId
-                sc.setParameters("cluster", clusterId);
-                List<HostVO> unmanagedHosts = lockRows(sc, null, true);
-                for (HostVO host : unmanagedHosts) {
-                    host.setManagementServerId(managementServerId);
-                    update(host.getId(), host);
-                    assignedHosts.add(host);
-                }
-            } else {
-                remainingClusters.add(clusterId);
+        if (s_logger.isDebugEnabled()) {
+            s_logger.debug("Acquiring hosts for clusters already owned by this management
server");
+        }
+        List<Long> clusters = findClustersOwnedByManagementServer(managementServerId);
+        if (clusters.size() > 0) {
+            // handle clusters already owned by @managementServerId
+            SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
+            sc.setParameters("lastPinged", lastPingSecondsAfter);
+            sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
+            sc.setParameters("clusterIn", clusters.toArray());
+            List<HostVO> unmanagedHosts = lockRows(sc, new Filter(HostVO.class, "clusterId",
true, 0L, limit), true); // host belongs to clusters owned by @managementServerId
+            StringBuilder sb = new StringBuilder();
+            for (HostVO host : unmanagedHosts) {
+                host.setManagementServerId(managementServerId);
+                update(host.getId(), host);
+                assignedHosts.add(host);
+                sb.append(host.getId());
+                sb.append(" ");
+            }
+            if (s_logger.isTraceEnabled()) {
+                s_logger.trace("Following hosts got acquired for clusters already owned:
" + sb.toString());
             }
         }
-        txn.commit();
+        if (s_logger.isDebugEnabled()) {
+            s_logger.debug("Completed acquiring hosts for clusters already owned by this
management server");
+        }
 
-        // for remaining clusters check if they can be owned
-        for (Long clusterId : remainingClusters) {
-            txn.start();
-            sc.setParameters("cluster", clusterId);
-            List<HostVO> unmanagedHosts = lockRows(sc, null, true);
-            if (canOwnCluster(clusterId)) { // cluster is not owned by any other MS, so @managementServerId
can own it
+        if (assignedHosts.size() < limit) {
+            if (s_logger.isDebugEnabled()) {
+                s_logger.debug("Acquiring hosts for clusters not owned by any management
server");
+            }
+            // for remaining hosts not owned by any MS check if they can be owned (by owning
full cluster)
+            clusters = findClustersForHostsNotOwnedByAnyManagementServer();
+            List<Long> updatedClusters = clusters;
+            if (clusters.size() > limit) {
+                updatedClusters = clusters.subList(0, limit.intValue());
+            }
+            if (updatedClusters.size() > 0) {
+                SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
+                sc.setParameters("lastPinged", lastPingSecondsAfter);
+                sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
+                sc.setParameters("clusterIn", updatedClusters.toArray());
+                List<HostVO> unmanagedHosts = lockRows(sc, null, true);
+
+                // group hosts based on cluster
+                Map<Long, List<HostVO>> hostMap = new HashMap<Long, List<HostVO>>();
                 for (HostVO host : unmanagedHosts) {
-                    host.setManagementServerId(managementServerId);
-                    update(host.getId(), host);
-                    assignedHosts.add(host);
+                    if (hostMap.get(host.getClusterId()) == null) {
+                        hostMap.put(host.getClusterId(), new ArrayList<HostVO>());
+                    }
+                    hostMap.get(host.getClusterId()).add(host);
+                }
+
+                StringBuilder sb = new StringBuilder();
+                for (Long clusterId : hostMap.keySet()) {
+                    if (canOwnCluster(clusterId)) { // cluster is not owned by any other
MS, so @managementServerId can own it
+                        List<HostVO> hostList = hostMap.get(clusterId);
+                        for (HostVO host : hostList) {
+                            host.setManagementServerId(managementServerId);
+                            update(host.getId(), host);
+                            assignedHosts.add(host);
+                            sb.append(host.getId());
+                            sb.append(" ");
+                        }
+                    }
+                    if (assignedHosts.size() > limit) {
+                        break;
+                    }
                 }
+                if (s_logger.isTraceEnabled()) {
+                    s_logger.trace("Following hosts got acquired from newly owned clusters:
" + sb.toString());
+                }
+            }
+            if (s_logger.isDebugEnabled()) {
+                s_logger.debug("Completed acquiring hosts for clusters not owned by any management
server");
             }
-            txn.commit();
         }
+        txn.commit();
 
         return assignedHosts;
     }
-    
+
     @Override @DB
     public List<HostVO> findAndUpdateApplianceToLoad(long lastPingSecondsAfter, long
managementServerId) {
     	Transaction txn = Transaction.currentTxn();

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/a81cc8a1/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java
----------------------------------------------------------------------
diff --git a/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java b/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java
index 19f0102..4fdb3c6 100755
--- a/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java
+++ b/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java
@@ -99,6 +99,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements
Clust
     public final static long SCAN_INTERVAL = 90000; // 90 seconds, it takes 60 sec for xenserver
to fail login
     public final static int ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION = 5; // 5 seconds
     public long _loadSize = 100;
+    protected int _directAgentScanInterval = 90; // 90 seconds
     protected Set<Long> _agentToTransferIds = new HashSet<Long>();
 
     @Inject
@@ -134,6 +135,9 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements
Clust
         String value = params.get(Config.DirectAgentLoadSize.key());
         _loadSize = NumbersUtil.parseInt(value, 16);
 
+        value = params.get(Config.DirectAgentScanInterval.key());
+        _directAgentScanInterval = NumbersUtil.parseInt(value, 90); // defaulted to 90 seconds
+
         ClusteredAgentAttache.initialize(this);
 
         _clusterMgr.registerListener(this);
@@ -146,7 +150,10 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements
Clust
         if (!super.start()) {
             return false;
         }
-        _timer.schedule(new DirectAgentScanTimerTask(), STARTUP_DELAY, SCAN_INTERVAL);
+        _timer.schedule(new DirectAgentScanTimerTask(), STARTUP_DELAY, _directAgentScanInterval
* 1000);
+        if (s_logger.isDebugEnabled()) {
+            s_logger.debug("Scheduled direct agent scan task to run at an interval of " +
_directAgentScanInterval + " seconds");
+        }
 
         // schedule transfer scan executor - if agent LB is enabled
         if (_clusterMgr.isAgentRebalanceEnabled()) {

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/a81cc8a1/server/src/com/cloud/configuration/Config.java
----------------------------------------------------------------------
diff --git a/server/src/com/cloud/configuration/Config.java b/server/src/com/cloud/configuration/Config.java
index 1a2c620..d3ed718 100755
--- a/server/src/com/cloud/configuration/Config.java
+++ b/server/src/com/cloud/configuration/Config.java
@@ -356,6 +356,7 @@ public enum Config {
 
 	ResourceCountCheckInterval("Advanced", ManagementServer.class, Long.class, "resourcecount.check.interval",
"0", "Time (in seconds) to wait before retrying resource count check task. Default is 0 which
is to never run the task", "Seconds"),
 	DirectAgentLoadSize("Advanced", ManagementServer.class, Integer.class, "direct.agent.load.size",
"16", "The number of direct agents to load each time", null),
+    DirectAgentScanInterval("Advanced", ManagementServer.class, Integer.class, "direct.agent.scan.interval",
"90", "Time interval (in seconds) to run the direct agent scan task", null),
 
 	//disabling lb as cluster sync does not work with distributed cluster
 	AgentLbEnable("Advanced", ManagementServer.class, Boolean.class, "agent.lb.enabled", "false",
"If agent load balancing enabled in cluster setup", null),

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/a81cc8a1/setup/db/db/schema-410to420.sql
----------------------------------------------------------------------
diff --git a/setup/db/db/schema-410to420.sql b/setup/db/db/schema-410to420.sql
index 69b17ee..5edf733 100644
--- a/setup/db/db/schema-410to420.sql
+++ b/setup/db/db/schema-410to420.sql
@@ -2152,6 +2152,8 @@ INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT',
'manag
 INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server',
'execute.in.sequence.hypervisor.commands', 'false', 'If set to true, StartCommand, StopCommand,
CopyVolumeCommand, CreateCommand will be synchronized on the agent side. If set to false,
these commands become asynchronous. Default value is false.');
 INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server',
'execute.in.sequence.network.element.commands', 'false', 'If set to true, DhcpEntryCommand,
SavePasswordCommand, UserDataCommand, VmDataCommand will be synchronized on the agent side.
If set to false, these commands become asynchronous. Default value is false.');
 
+INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server',
'direct.agent.scan.interval', 90, 'Time interval (in seconds) to run the direct agent scan
task.');
+
 ALTER TABLE `cloud`.`vm_template` ADD COLUMN `dynamically_scalable` tinyint(1) unsigned NOT
NULL DEFAULT 0  COMMENT 'true if template contains XS/VMWare tools inorder to support dynamic
scaling of VM cpu/memory';
 UPDATE `cloud`.`vm_template` SET dynamically_scalable = 1 WHERE name = "CentOS 5.6(64-bit)
no GUI (XenServer)" AND type = "BUILTIN";
 


Mime
View raw message