ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jonathanhur...@apache.org
Subject ambari git commit: AMBARI-15896 - RU/EU: Unable to start upgrade with host in MM (jonathanhurley)
Date Fri, 15 Apr 2016 14:51:46 GMT
Repository: ambari
Updated Branches:
  refs/heads/branch-2.2.2 dfc13185b -> e2634cde5


AMBARI-15896 - RU/EU: Unable to start upgrade with host in MM (jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/e2634cde
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/e2634cde
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/e2634cde

Branch: refs/heads/branch-2.2.2
Commit: e2634cde5c30db851cb5c7521aec5bf67c21d11d
Parents: dfc1318
Author: Jonathan Hurley <jhurley@hortonworks.com>
Authored: Thu Apr 14 16:11:02 2016 -0400
Committer: Jonathan Hurley <jhurley@hortonworks.com>
Committed: Fri Apr 15 10:51:40 2016 -0400

----------------------------------------------------------------------
 .../ambari/server/checks/ServicesUpCheck.java   | 200 +++++++++++++------
 .../server/orm/models/HostComponentSummary.java |  15 +-
 .../server/state/cluster/ClustersImpl.java      |   4 +-
 .../server/checks/ServicesUpCheckTest.java      |  48 +++++
 4 files changed, 197 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/e2634cde/ambari-server/src/main/java/org/apache/ambari/server/checks/ServicesUpCheck.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/checks/ServicesUpCheck.java
b/ambari-server/src/main/java/org/apache/ambari/server/checks/ServicesUpCheck.java
index ea8569c..d4c7894 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/checks/ServicesUpCheck.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/checks/ServicesUpCheck.java
@@ -30,6 +30,8 @@ import org.apache.ambari.server.controller.PrereqCheckRequest;
 import org.apache.ambari.server.orm.models.HostComponentSummary;
 import org.apache.ambari.server.state.Cluster;
 import org.apache.ambari.server.state.ComponentInfo;
+import org.apache.ambari.server.state.Host;
+import org.apache.ambari.server.state.MaintenanceState;
 import org.apache.ambari.server.state.Service;
 import org.apache.ambari.server.state.ServiceComponent;
 import org.apache.ambari.server.state.StackId;
@@ -41,7 +43,24 @@ import org.apache.commons.lang.StringUtils;
 import com.google.inject.Singleton;
 
 /**
- * Checks that services are up.
+ * The {@link ServicesUpCheck} class is used to ensure that services are up
+ * "enough" before an upgrade begins. This class uses the following rules:
+ * <ul>
+ * <li>If the component is a CLIENT, it will skip it</li>
+ * <li>If the component is a MASTER then it must be online and not in MM</li>
+ * <li>If the component is a SLAVE
+ * <ul>
+ * <li>If the cardinality is 1+, then determine if {@value #SLAVE_THRESHOLD} %
+ * are running. Hosts in MM are counted as being "up" since they are not part of
+ * the upgrade.</li>
+ * <li>If the cardinality is 0, then all instances must be online. Hosts in MM
+ * are counted as being "up" since they are not part of the upgrade.</li>
+ * </ul>
+ * </ul>
+ * It seems counter-intuitive to have a liveliness check which allows a
+ * percentage of the slaves to be down. The goal is to be able to start an
+ * upgrade, even if some slave components on healthy hosts are down. We still
+ * want hosts to be scehdule for upgrade of their other components.
  */
 @Singleton
 @UpgradeCheck(group = UpgradeCheckGroup.LIVELINESS, order = 2.0f, required = true)
@@ -56,8 +75,13 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
     super(CheckDescription.SERVICES_UP);
   }
 
+  /**
+   * {@inheritDoc}
+   */
   @Override
-  public void perform(PrerequisiteCheck prerequisiteCheck, PrereqCheckRequest request) throws
AmbariException {
+  public void perform(PrerequisiteCheck prerequisiteCheck, PrereqCheckRequest request)
+      throws AmbariException {
+
     final String clusterName = request.getClusterName();
     final Cluster cluster = clustersProvider.get().getCluster(clusterName);
     List<String> errorMessages = new ArrayList<String>();
@@ -69,74 +93,84 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
       final Service service = serviceEntry.getValue();
 
       // Ignore services like Tez that are clientOnly.
-      if (!service.isClientOnlyService()) {
-        Map<String, ServiceComponent> serviceComponents = service.getServiceComponents();
+      if (service.isClientOnlyService()) {
+        continue;
+      }
 
-        for (Map.Entry<String, ServiceComponent> component : serviceComponents.entrySet())
{
+      Map<String, ServiceComponent> serviceComponents = service.getServiceComponents();
+      for (Map.Entry<String, ServiceComponent> component : serviceComponents.entrySet())
{
 
-          boolean ignoreComponent = false;
-          boolean checkThreshold = false;
+        ServiceComponent serviceComponent = component.getValue();
+        // In Services like HDFS, ignore components like HDFS Client
+        if (serviceComponent.isClientComponent()) {
+          continue;
+        }
 
-          ServiceComponent serviceComponent = component.getValue();
-          // In Services like HDFS, ignore components like HDFS Client
-          if (serviceComponent.isClientComponent()) {
-            ignoreComponent = true;
+        // skip if the component is not part of the finalization version check
+        if (!serviceComponent.isVersionAdvertised()) {
+          continue;
+        }
+
+        // TODO, add more logic that checks the Upgrade Pack.
+        // These components are not in the upgrade pack and do not advertise a
+        // version:
+        // ZKFC, Ambari Metrics, Kerberos, Atlas (right now).
+        // Generally, if it advertises a version => in the upgrade pack.
+        // So it can be in the Upgrade Pack but not advertise a version.
+        List<HostComponentSummary> hostComponentSummaries = HostComponentSummary.getHostComponentSummaries(
+            service.getName(), serviceComponent.getName());
+
+        // not installed, nothing to do
+        if (hostComponentSummaries.isEmpty()) {
+          continue;
+        }
+
+        // non-master, "true" slaves with cardinality 1+
+        boolean checkThreshold = false;
+        if (!serviceComponent.isMasterComponent()) {
+          ComponentInfo componentInfo = ambariMetaInfo.get().getComponent(stackId.getStackName(),
+              stackId.getStackVersion(), serviceComponent.getServiceName(),
+              serviceComponent.getName());
+
+          String cardinality = componentInfo.getCardinality();
+          // !!! check if there can be more than one. This will match, say,
+          // datanodes but not ZKFC
+          if (null != cardinality
+              && (cardinality.equals("ALL") || cardinality.matches("[1-9].*"))) {
+            checkThreshold = true;
           }
+        }
+
+        // check threshold for slaves which have a non-0 cardinality
+        if (checkThreshold) {
+          int total = hostComponentSummaries.size();
+          int up = 0;
+          int down = 0;
 
-          // non-master, "true" slaves with cardinality 1+
-          if (!ignoreComponent && !serviceComponent.isMasterComponent()) {
-            ComponentInfo componentInfo = ambariMetaInfo.get().getComponent(
-                stackId.getStackName(), stackId.getStackVersion(),
-                serviceComponent.getServiceName(), serviceComponent.getName());
-
-            String cardinality = componentInfo.getCardinality();
-            // !!! check if there can be more than one. This will match, say, datanodes but
not ZKFC
-            if (null != cardinality &&
-                (cardinality.equals("ALL") || cardinality.matches("[1-9].*"))) {
-              checkThreshold = true;
+          for (HostComponentSummary summary : hostComponentSummaries) {
+            if (isConsideredDown(cluster, serviceComponent, summary)) {
+              down++;
+            } else {
+              up++;
             }
           }
 
-          if (!serviceComponent.isVersionAdvertised()) {
-            ignoreComponent = true;
+          if ((float) down / total > SLAVE_THRESHOLD) { // arbitrary
+            failedServiceNames.add(service.getName());
+            String message = MessageFormat.format(
+                "{0}: {1} out of {2} {3} are started; there should be {4,number,percent}
started before upgrading.",
+                service.getName(), up, total, serviceComponent.getName(), SLAVE_THRESHOLD);
+            errorMessages.add(message);
           }
-
-          // TODO, add more logic that checks the Upgrade Pack.
-          // These components are not in the upgrade pack and do not advertise a version:
-          // ZKFC, Ambari Metrics, Kerberos, Atlas (right now).
-          // Generally, if it advertises a version => in the upgrade pack.
-          // So it can be in the Upgrade Pack but not advertise a version.
-          if (!ignoreComponent) {
-            List<HostComponentSummary> hostComponentSummaries = HostComponentSummary.getHostComponentSummaries(service.getName(),
serviceComponent.getName());
-
-            if (checkThreshold && !hostComponentSummaries.isEmpty()) {
-              int total = hostComponentSummaries.size();
-              int up = 0;
-              int down = 0;
-
-              for(HostComponentSummary s : hostComponentSummaries) {
-                if ((s.getDesiredState() == State.INSTALLED || s.getDesiredState() == State.STARTED)
&& State.STARTED != s.getCurrentState()) {
-                  down++;
-                } else {
-                  up++;
-                }
-              }
-
-              if ((float) down/total > SLAVE_THRESHOLD) { // arbitrary
-                failedServiceNames.add(service.getName());
-                String message = MessageFormat.format("{0}: {1} out of {2} {3} are started;
there should be {4,number,percent} started before upgrading.",
-                    service.getName(), up, total, serviceComponent.getName(), SLAVE_THRESHOLD);
-                errorMessages.add(message);
-              }
-            } else {
-              for(HostComponentSummary s : hostComponentSummaries) {
-                if ((s.getDesiredState() == State.INSTALLED || s.getDesiredState() == State.STARTED)
&& State.STARTED != s.getCurrentState()) {
-                  failedServiceNames.add(service.getName());
-                  String message = MessageFormat.format("{0}: {1} (in {2} on host {3})",
service.getName(), serviceComponent.getName(), s.getCurrentState(), s.getHostName());
-                  errorMessages.add(message);
-                  break;
-                }
-              }
+        } else {
+          for (HostComponentSummary summary : hostComponentSummaries) {
+            if (isConsideredDown(cluster, serviceComponent, summary)) {
+              failedServiceNames.add(service.getName());
+              String message = MessageFormat.format("{0}: {1} (in {2} on host {3})",
+                  service.getName(), serviceComponent.getName(), summary.getCurrentState(),
+                  summary.getHostName());
+              errorMessages.add(message);
+              break;
             }
           }
         }
@@ -146,8 +180,48 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
     if (!errorMessages.isEmpty()) {
       prerequisiteCheck.setFailedOn(new LinkedHashSet<String>(failedServiceNames));
       prerequisiteCheck.setStatus(PrereqCheckStatus.FAIL);
-      prerequisiteCheck.setFailReason("The following Service Components should be in a started
state.  Please invoke a service Stop and full Start and try again. " + StringUtils.join(errorMessages,
", "));
+      prerequisiteCheck.setFailReason(
+          "The following Service Components should be in a started state.  Please invoke
a service Stop and full Start and try again. "
+              + StringUtils.join(errorMessages, ", "));
     }
   }
 
-}
\ No newline at end of file
+  /**
+   * Gets whether this component should be considered as being "down" for the
+   * purposes of this check. Component type, maintenance mode, and state are
+   * taken into account.
+   *
+   * @param clusters
+   *          the clusters instance
+   * @param cluster
+   *          the cluster
+   * @param serviceComponent
+   *          the component
+   * @param summary
+   *          a summary of the state of the component on a host
+   * @return {@code true} if the host component should be considered as failing
+   *         this test.
+   * @throws AmbariException
+   */
+  private boolean isConsideredDown(Cluster cluster, ServiceComponent serviceComponent,
+      HostComponentSummary summary) throws AmbariException {
+    Host host = clustersProvider.get().getHostById(summary.getHostId());
+    MaintenanceState maintenanceState = host.getMaintenanceState(cluster.getClusterId());
+
+    // non-MASTER components in maintenance mode should not count
+    if (maintenanceState == MaintenanceState.ON && !serviceComponent.isMasterComponent())
{
+      return false;
+    }
+
+    State desiredState = summary.getDesiredState();
+    State currentState = summary.getCurrentState();
+
+    switch (desiredState) {
+      case INSTALLED:
+      case STARTED:
+        return currentState != State.STARTED;
+      default:
+        return false;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/ambari/blob/e2634cde/ambari-server/src/main/java/org/apache/ambari/server/orm/models/HostComponentSummary.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/models/HostComponentSummary.java
b/ambari-server/src/main/java/org/apache/ambari/server/orm/models/HostComponentSummary.java
index c79ec8a..cdc06fe 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/orm/models/HostComponentSummary.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/orm/models/HostComponentSummary.java
@@ -18,7 +18,9 @@
 
 package org.apache.ambari.server.orm.models;
 
-import com.google.inject.Inject;
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.ambari.server.StaticallyInject;
 import org.apache.ambari.server.orm.dao.HostComponentDesiredStateDAO;
 import org.apache.ambari.server.orm.dao.HostComponentStateDAO;
@@ -28,8 +30,7 @@ import org.apache.ambari.server.orm.entities.HostComponentStateEntity;
 import org.apache.ambari.server.orm.entities.HostEntity;
 import org.apache.ambari.server.state.State;
 
-import java.util.ArrayList;
-import java.util.List;
+import com.google.inject.Inject;
 
 @StaticallyInject
 public class HostComponentSummary {
@@ -56,15 +57,19 @@ public class HostComponentSummary {
 
     HostEntity host = hostDao.findById(hostId);
     if (host != null) {
-      this.hostName = host.getHostName();
+      hostName = host.getHostName();
     }
 
     this.desiredState = desiredState;
     this.currentState = currentState;
   }
 
+  public long getHostId() {
+    return hostId;
+  }
+
   public String getHostName() {
-    return (this.hostName == null || this.hostName.isEmpty()) ? "" : this.hostName;
+    return (hostName == null || hostName.isEmpty()) ? "" : hostName;
   }
 
   public State getDesiredState() {

http://git-wip-us.apache.org/repos/asf/ambari/blob/e2634cde/ambari-server/src/main/java/org/apache/ambari/server/state/cluster/ClustersImpl.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/state/cluster/ClustersImpl.java
b/ambari-server/src/main/java/org/apache/ambari/server/state/cluster/ClustersImpl.java
index 03068fc..b678f9cf 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/state/cluster/ClustersImpl.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/state/cluster/ClustersImpl.java
@@ -400,11 +400,11 @@ public class ClustersImpl implements Clusters {
   public Host getHostById(Long hostId) throws AmbariException {
     checkLoaded();
 
-    if (!hosts.containsKey(hostId)) {
+    if (!hostsById.containsKey(hostId)) {
       throw new HostNotFoundException("Host Id = " + hostId);
     }
 
-    return hosts.get(hostId);
+    return hostsById.get(hostId);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/ambari/blob/e2634cde/ambari-server/src/test/java/org/apache/ambari/server/checks/ServicesUpCheckTest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/checks/ServicesUpCheckTest.java
b/ambari-server/src/test/java/org/apache/ambari/server/checks/ServicesUpCheckTest.java
index 67b801c..3910a0d 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/checks/ServicesUpCheckTest.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/checks/ServicesUpCheckTest.java
@@ -30,6 +30,8 @@ import org.apache.ambari.server.orm.models.HostComponentSummary;
 import org.apache.ambari.server.state.Cluster;
 import org.apache.ambari.server.state.Clusters;
 import org.apache.ambari.server.state.ComponentInfo;
+import org.apache.ambari.server.state.Host;
+import org.apache.ambari.server.state.MaintenanceState;
 import org.apache.ambari.server.state.Service;
 import org.apache.ambari.server.state.ServiceComponent;
 import org.apache.ambari.server.state.StackId;
@@ -95,11 +97,23 @@ public class ServicesUpCheckTest {
     };
 
 
+    Host host1 = Mockito.mock(Host.class);
+    Host host2 = Mockito.mock(Host.class);
+    Host host3 = Mockito.mock(Host.class);
+
+    Mockito.when(host1.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.OFF);
+    Mockito.when(host2.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.OFF);
+    Mockito.when(host3.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.OFF);
+
     final Cluster cluster = Mockito.mock(Cluster.class);
     Mockito.when(cluster.getClusterId()).thenReturn(1L);
     Mockito.when(cluster.getCurrentStackVersion()).thenReturn(new StackId("HDP", "2.2"));
     Mockito.when(clusters.getCluster("cluster")).thenReturn(cluster);
 
+    Mockito.when(clusters.getHostById(Long.valueOf(1))).thenReturn(host1);
+    Mockito.when(clusters.getHostById(Long.valueOf(2))).thenReturn(host2);
+    Mockito.when(clusters.getHostById(Long.valueOf(3))).thenReturn(host3);
+
     final Service hdfsService = Mockito.mock(Service.class);
     final Service tezService = Mockito.mock(Service.class);
     final Service amsService = Mockito.mock(Service.class);
@@ -201,6 +215,16 @@ public class ServicesUpCheckTest {
     final HostComponentSummary hcsMetricsCollector = Mockito.mock(HostComponentSummary.class);
     final HostComponentSummary hcsMetricsMonitor = Mockito.mock(HostComponentSummary.class);
 
+    // set hosts for the summaries
+    Mockito.when(hcsNameNode.getHostId()).thenReturn(Long.valueOf(1));
+    Mockito.when(hcsDataNode1.getHostId()).thenReturn(Long.valueOf(1));
+    Mockito.when(hcsDataNode2.getHostId()).thenReturn(Long.valueOf(2));
+    Mockito.when(hcsDataNode3.getHostId()).thenReturn(Long.valueOf(3));
+    Mockito.when(hcsZKFC.getHostId()).thenReturn(Long.valueOf(1));
+    Mockito.when(hcsTezClient.getHostId()).thenReturn(Long.valueOf(2));
+    Mockito.when(hcsMetricsCollector.getHostId()).thenReturn(Long.valueOf(1));
+    Mockito.when(hcsMetricsMonitor.getHostId()).thenReturn(Long.valueOf(1));
+
     List<HostComponentSummary> allHostComponentSummaries = new ArrayList<HostComponentSummary>();
     allHostComponentSummaries.add(hcsNameNode);
     allHostComponentSummaries.add(hcsDataNode1);
@@ -268,5 +292,29 @@ public class ServicesUpCheckTest {
     servicesUpCheck.perform(check, new PrereqCheckRequest("cluster"));
     Assert.assertEquals(PrereqCheckStatus.FAIL, check.getStatus());
     Assert.assertTrue(check.getFailReason().indexOf("50%") > -1);
+
+    // place the DN slaves into MM which will allow them to be skipped
+    Mockito.when(host1.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.ON);
+    Mockito.when(host3.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.ON);
+    check = new PrerequisiteCheck(null, null);
+    servicesUpCheck.perform(check, new PrereqCheckRequest("cluster"));
+    Assert.assertEquals(PrereqCheckStatus.PASS, check.getStatus());
+
+    // put everything back to normal, then fail NN
+    Mockito.when(hcsNameNode.getCurrentState()).thenReturn(State.INSTALLED);
+    Mockito.when(hcsDataNode1.getCurrentState()).thenReturn(State.STARTED);
+    Mockito.when(hcsDataNode2.getCurrentState()).thenReturn(State.STARTED);
+    Mockito.when(host1.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.OFF);
+    Mockito.when(host3.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.OFF);
+
+    check = new PrerequisiteCheck(null, null);
+    servicesUpCheck.perform(check, new PrereqCheckRequest("cluster"));
+    Assert.assertEquals(PrereqCheckStatus.FAIL, check.getStatus());
+
+    // put NN into MM; should still fail since it's a master
+    Mockito.when(host1.getMaintenanceState(Mockito.anyLong())).thenReturn(MaintenanceState.ON);
+    check = new PrerequisiteCheck(null, null);
+    servicesUpCheck.perform(check, new PrereqCheckRequest("cluster"));
+    Assert.assertEquals(PrereqCheckStatus.FAIL, check.getStatus());
   }
 }


Mime
View raw message