ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From alejan...@apache.org
Subject ambari git commit: AMBARI-15446. Auto-retry on failure during RU/EU (alejandro)
Date Mon, 21 Mar 2016 21:00:30 GMT
Repository: ambari
Updated Branches:
  refs/heads/trunk 0521787d8 -> 79b9e570f


AMBARI-15446. Auto-retry on failure during RU/EU (alejandro)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/79b9e570
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/79b9e570
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/79b9e570

Branch: refs/heads/trunk
Commit: 79b9e570fe1549bfc312dc02c29246c56043face
Parents: 0521787
Author: Alejandro Fernandez <afernandez@hortonworks.com>
Authored: Mon Mar 21 11:54:39 2016 -0700
Committer: Alejandro Fernandez <afernandez@hortonworks.com>
Committed: Mon Mar 21 14:00:19 2016 -0700

----------------------------------------------------------------------
 .../actionmanager/ActionDBAccessorImpl.java     |   5 +
 .../server/actionmanager/HostRoleCommand.java   |  12 +
 .../ambari/server/agent/HeartbeatProcessor.java |   5 +
 .../server/configuration/Configuration.java     | 109 +++++++
 .../server/orm/dao/HostRoleCommandDAO.java      |  10 +
 .../orm/entities/HostRoleCommandEntity.java     |  28 ++
 .../services/RetryUpgradeActionService.java     | 269 ++++++++++++++++++
 .../ambari/server/topology/HostRequest.java     |   2 +
 .../ambari/server/topology/LogicalRequest.java  |   2 +-
 .../server/upgrade/UpgradeCatalog240.java       |  15 +
 .../main/resources/Ambari-DDL-MySQL-CREATE.sql  |   1 +
 .../main/resources/Ambari-DDL-Oracle-CREATE.sql |   1 +
 .../resources/Ambari-DDL-Postgres-CREATE.sql    |   1 +
 .../Ambari-DDL-Postgres-EMBEDDED-CREATE.sql     |   1 +
 .../resources/Ambari-DDL-SQLAnywhere-CREATE.sql |   1 +
 .../resources/Ambari-DDL-SQLServer-CREATE.sql   |   1 +
 .../services/RetryUpgradeActionServiceTest.java | 284 +++++++++++++++++++
 17 files changed, 746 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
index 429f573..a75a000 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java
@@ -601,6 +601,9 @@ public class ActionDBAccessorImpl implements ActionDBAccessor {
     HostRoleCommandEntity entity = hostRoleCommandDAO.findByPK(hostRoleCommand.getTaskId());
     if (entity != null) {
       entity.setStartTime(hostRoleCommand.getStartTime());
+      if (entity.getOriginalStartTime() == null || entity.getOriginalStartTime() == -1) {
+        entity.setOriginalStartTime(System.currentTimeMillis());
+      }
       entity.setLastAttemptTime(hostRoleCommand.getLastAttemptTime());
       entity.setStatus(hostRoleCommand.getStatus());
       entity.setAttemptCount(hostRoleCommand.getAttemptCount());
@@ -738,6 +741,8 @@ public class ActionDBAccessorImpl implements ActionDBAccessor {
     List<HostRoleCommandEntity> tasks = hostRoleCommandDAO.findByPKs(taskIds);
     for (HostRoleCommandEntity task : tasks) {
       task.setStatus(HostRoleStatus.PENDING);
+      // TODO HACK, shouldn't reset start time.
+      // Because it expects -1, RetryUpgradeActionService.java also had to set it to -1.
       task.setStartTime(-1L);
       task.setEndTime(-1L);
     }

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
index 2764b3f..83fa6b9 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
@@ -58,6 +58,7 @@ public class HostRoleCommand {
   private String structuredOut = "";
   private int exitCode = 999; //Default is unknown
   private long startTime = -1;
+  private long originalStartTime = -1;
   private long endTime = -1;
   private long lastAttemptTime = -1;
   private short attemptCount = 0;
@@ -160,6 +161,7 @@ public class HostRoleCommand {
     structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new String(hostRoleCommandEntity.getStructuredOut()) : "";
     exitCode = hostRoleCommandEntity.getExitcode();
     startTime = hostRoleCommandEntity.getStartTime();
+    originalStartTime = hostRoleCommandEntity.getOriginalStartTime();
     endTime = hostRoleCommandEntity.getEndTime() != null ? hostRoleCommandEntity.getEndTime() : -1L;
     lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime();
     attemptCount = hostRoleCommandEntity.getAttemptCount();
@@ -182,6 +184,7 @@ public class HostRoleCommand {
     hostRoleCommandEntity.setStdOut(stdout.getBytes());
     hostRoleCommandEntity.setStructuredOut(structuredOut.getBytes());
     hostRoleCommandEntity.setStartTime(startTime);
+    hostRoleCommandEntity.setOriginalStartTime(originalStartTime);
     hostRoleCommandEntity.setEndTime(endTime);
     hostRoleCommandEntity.setLastAttemptTime(lastAttemptTime);
     hostRoleCommandEntity.setAttemptCount(attemptCount);
@@ -328,6 +331,14 @@ public class HostRoleCommand {
     this.startTime = startTime;
   }
 
+  public long getOriginalStartTime() {
+    return originalStartTime;
+  }
+
+  public void setOriginalStartTime(long originalStartTime) {
+    this.originalStartTime = originalStartTime;
+  }
+
   public long getLastAttemptTime() {
     return lastAttemptTime;
   }
@@ -429,6 +440,7 @@ public class HostRoleCommand {
     builder.append("  stderr: ").append(stderr).append("\n");
     builder.append("  exitcode: ").append(exitCode).append("\n");
     builder.append("  Start time: ").append(startTime).append("\n");
+    builder.append("  Original Start time: ").append(originalStartTime).append("\n");
     builder.append("  Last attempt time: ").append(lastAttemptTime).append("\n");
     builder.append("  attempt count: ").append(attemptCount).append("\n");
     return builder.toString();

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java
index a1a686a..7e57031 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java
@@ -403,6 +403,11 @@ public class HeartbeatProcessor extends AbstractService{
       if (hostRoleCommand.getStatus() == HostRoleStatus.QUEUED &&
           report.getStatus().equals("IN_PROGRESS")) {
         hostRoleCommand.setStartTime(now);
+
+        // Because the task may be retried several times, set the original start time only once.
+        if (hostRoleCommand.getOriginalStartTime() == -1) {
+          hostRoleCommand.setOriginalStartTime(now);
+        }
       }
 
       // If the report indicates the keytab file was successfully transferred to a host or removed

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java b/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java
index 9404506..bf18325 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java
@@ -39,6 +39,7 @@ import org.apache.ambari.server.state.stack.OsFamily;
 import org.apache.ambari.server.utils.AmbariPath;
 import org.apache.ambari.server.utils.Parallel;
 import org.apache.ambari.server.utils.ShellCommandUtil;
+import org.apache.commons.collections.ListUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.RandomStringUtils;
 import org.apache.commons.lang.StringUtils;
@@ -210,9 +211,41 @@ public class Configuration {
   // Properties for stack upgrade (Rolling, Express)
   public static final String ROLLING_UPGRADE_SKIP_PACKAGES_PREFIXES_KEY = "rolling.upgrade.skip.packages.prefixes";
   public static final String ROLLING_UPGRADE_SKIP_PACKAGES_PREFIXES_DEFAULT = "";
+
   public static final String STACK_UPGRADE_BYPASS_PRECHECKS_KEY = "stack.upgrade.bypass.prechecks";
   public static final String STACK_UPGRADE_BYPASS_PRECHECKS_DEFAULT = "false";
 
+  /**
+   * If a host is shut down or ambari-agent is stopped, Ambari Server will still keep waiting until the task times out,
+   * say 10-20 mins. If the host comes back online and ambari-agent is started, then this retry property needs
+   * to be greater; ideally, it should be greater than 2 * command_timeout in order to retry at least
+   * 3 times in that amount of mins.
+   * Suggested value is 15-30 mins.
+   */
+  public static final String STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY = "stack.upgrade.auto.retry.timeout.mins";
+  public static final String STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_DEFAULT = "0";
+
+  /**
+   * If the stack.upgrade.auto.retry.timeout.mins property is positive, then run RetryUpgradeActionService every x
+   * seconds.
+   */
+  public static final String STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_KEY = "stack.upgrade.auto.retry.check.interval.secs";
+  public static final String STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_DEFAULT = "20";
+
+  /**
+   * If auto-retry during stack upgrade is enabled, skip any tasks whose custom command name contains at least one
+   * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas.
+   */
+  public static final String STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_KEY = "stack.upgrade.auto.retry.command.names.to.ignore";
+  public static final String STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_DEFAULT = "\"ComponentVersionCheckAction\",\"FinalizeUpgradeAction\"";
+
+  /**
+   * If auto-retry during stack upgrade is enabled, skip any tasks whose command details contains at least one
+   * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas.
+   */
+  public static final String STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_KEY = "stack.upgrade.auto.retry.command.details.to.ignore";
+  public static final String STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_DEFAULT = "\"Execute HDFS Finalize\"";
+
   public static final String JWT_AUTH_ENBABLED = "authentication.jwt.enabled";
   public static final String JWT_AUTH_PROVIDER_URL = "authentication.jwt.providerUrl";
   public static final String JWT_PUBLIC_KEY = "authentication.jwt.publicKey";
@@ -1110,6 +1143,82 @@ public class Configuration {
   }
 
   /**
+   * During stack upgrade, can auto-retry failures for up to x mins. This is useful to improve the robustness in unstable environments.
+   * Suggested value is 0-30 mins.
+   * @return
+   */
+  public int getStackUpgradeAutoRetryTimeoutMins() {
+    Integer result = NumberUtils.toInt(properties.getProperty(STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY, STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_DEFAULT));
+    return result >= 0 ? result : 0;
+  }
+
+  /**
+   * If the stack.upgrade.auto.retry.timeout.mins property is positive, then run RetryUpgradeActionService every x
+   * seconds.
+   * @return Number of seconds between runs of {@link org.apache.ambari.server.state.services.RetryUpgradeActionService}
+   */
+  public int getStackUpgradeAutoRetryCheckIntervalSecs() {
+    Integer result = NumberUtils.toInt(properties.getProperty(STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_KEY, STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_DEFAULT));
+    return result >= 0 ? result : 0;
+  }
+
+  /**
+   * If auto-retry during stack upgrade is enabled, skip any tasks whose custom command name contains at least one
+   * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas.
+   * @return
+   */
+  public List<String> getStackUpgradeAutoRetryCustomCommandNamesToIgnore() {
+    String value = properties.getProperty(STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_KEY, STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_DEFAULT);
+    List<String> list = convertCSVwithQuotesToList(value);
+    listToLowerCase(list);
+    return list;
+  }
+
+  /**
+   * If auto-retry during stack upgrade is enabled, skip any tasks whose command details contains at least one
+   * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas.
+   * @return
+   */
+  public List<String> getStackUpgradeAutoRetryCommandDetailsToIgnore() {
+    String value = properties.getProperty(STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_KEY, STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_DEFAULT);
+    List<String> list = convertCSVwithQuotesToList(value);
+    listToLowerCase(list);
+    return list;
+  }
+
+  /**
+   * Convert quoted elements separated by commas into a list. Values cannot contain double quotes or commas.
+   * @param value, e.g., String with value "a","b","c" => ["a", "b", "c"]
+   * @return List of parsed values, or empty list if no values exist.
+   */
+  private List<String> convertCSVwithQuotesToList(String value) {
+    List<String> list = new ArrayList<>();
+    if (StringUtils.isNotEmpty(value)) {
+      if (value.indexOf(",") >= 0) {
+        for (String e : value.split(",")) {
+          e = StringUtils.stripStart(e, "\"");
+          e = StringUtils.stripEnd(e, "\"");
+          list.add(e);
+        }
+      } else {
+        list.add(value);
+      }
+    }
+    return list;
+  }
+
+  /**
+   * Convert the elements of a list to lowercase.
+   * @param list
+   */
+  private void listToLowerCase(List<String> list) {
+    if (list == null) return;
+    for (int i = 0; i < list.size(); i++) {
+      list.set(i, list.get(i).toLowerCase());
+    }
+  }
+
+  /**
    * Get the map with server config parameters.
    * Keys - public constants of this class
    * @return the map with server config parameters

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java
index f5b1cb4..2e35001 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java
@@ -313,6 +313,16 @@ public class HostRoleCommandDAO {
   }
 
   @RequiresSession
+  public List<HostRoleCommandEntity> findByRequestIdAndStatuses(Long requestId, Collection<HostRoleStatus> statuses) {
+    TypedQuery<HostRoleCommandEntity> query = entityManagerProvider.get().createNamedQuery(
+        "HostRoleCommandEntity.findByRequestIdAndStatuses", HostRoleCommandEntity.class);
+    query.setParameter("requestId", requestId);
+    query.setParameter("statuses", statuses);
+    List results = query.getResultList();
+    return results;
+  }
+
+  @RequiresSession
   public List<Long> findTaskIdsByRequestIds(Collection<Long> requestIds) {
     TypedQuery<Long> query = entityManagerProvider.get().createQuery(
         "SELECT task.taskId FROM HostRoleCommandEntity task " +

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java b/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java
index 19f0602..6288091 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java
@@ -61,6 +61,7 @@ import org.apache.commons.lang.ArrayUtils;
 )
 @NamedQueries({
     @NamedQuery(name = "HostRoleCommandEntity.findCountByCommandStatuses", query = "SELECT COUNT(command.taskId) FROM HostRoleCommandEntity command WHERE command.status IN :statuses"),
+    @NamedQuery(name = "HostRoleCommandEntity.findByRequestIdAndStatuses", query="SELECT task FROM HostRoleCommandEntity task WHERE task.requestId=:requestId AND task.status IN :statuses ORDER BY task.taskId ASC"),
     @NamedQuery(name = "HostRoleCommandEntity.findTasksByStatusesOrderByIdDesc", query = "SELECT task FROM HostRoleCommandEntity task WHERE task.requestId = :requestId AND task.status IN :statuses ORDER BY task.taskId DESC"),
     @NamedQuery(name = "HostRoleCommandEntity.findNumTasksAlreadyRanInStage", query = "SELECT COUNT(task.taskId) FROM HostRoleCommandEntity task WHERE task.requestId = :requestId AND task.taskId > :taskId AND task.stageId > :stageId AND task.status NOT IN :statuses"),
     @NamedQuery(name = "HostRoleCommandEntity.findByCommandStatuses", query = "SELECT command FROM HostRoleCommandEntity command WHERE command.status IN :statuses ORDER BY command.requestId, command.stageId"),
@@ -136,6 +137,13 @@ public class HostRoleCommandEntity {
   @Column(name = "start_time", nullable = false)
   private Long startTime = -1L;
 
+  /**
+   * Because the startTime is allowed to be overwritten, introduced a new column for the original start time.
+   */
+  @Basic
+  @Column(name = "original_start_time", nullable = false)
+  private Long originalStartTime = -1L;
+
   @Basic
   @Column(name = "end_time", nullable = false)
   private Long endTime = -1L;
@@ -283,6 +291,22 @@ public class HostRoleCommandEntity {
     this.startTime = startTime;
   }
 
+  /**
+   * Get the original time the command was first scheduled on the agent. This value is never overwritten.
+   * @return Original start time
+   */
+  public Long getOriginalStartTime() {
+    return originalStartTime;
+  }
+
+  /**
+   * Set the original start time when the command is first scheduled. This value is never overwritten.
+   * @param originalStartTime Original start time
+   */
+  public void setOriginalStartTime(Long originalStartTime) {
+    this.originalStartTime = originalStartTime;
+  }
+
   public Long getLastAttemptTime() {
     return lastAttemptTime;
   }
@@ -421,6 +445,9 @@ public class HostRoleCommandEntity {
     if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) {
       return false;
     }
+    if (originalStartTime != null ? !originalStartTime.equals(that.originalStartTime) : that.originalStartTime != null) {
+      return false;
+    }
     if (status != null ? !status.equals(that.status) : that.status != null) {
       return false;
     }
@@ -464,6 +491,7 @@ public class HostRoleCommandEntity {
     result = 31 * result + (outputLog != null ? outputLog.hashCode() : 0);
     result = 31 * result + (errorLog != null ? errorLog.hashCode() : 0);
     result = 31 * result + (startTime != null ? startTime.hashCode() : 0);
+    result = 31 * result + (originalStartTime != null ? originalStartTime.hashCode() : 0);
     result = 31 * result + (lastAttemptTime != null ? lastAttemptTime.hashCode() : 0);
     result = 31 * result + (attemptCount != null ? attemptCount.hashCode() : 0);
     result = 31 * result + (endTime != null ? endTime.hashCode() : 0);

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
new file mode 100644
index 0000000..7d81cc4
--- /dev/null
+++ b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
@@ -0,0 +1,269 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ambari.server.state.services;
+
+import com.google.common.util.concurrent.AbstractScheduledService;
+import com.google.inject.Inject;
+import com.google.inject.Injector;
+import com.google.inject.Provider;
+import com.google.inject.persist.Transactional;
+import org.apache.ambari.server.AmbariService;
+import org.apache.ambari.server.actionmanager.HostRoleStatus;
+import org.apache.ambari.server.configuration.Configuration;
+import org.apache.ambari.server.orm.dao.HostRoleCommandDAO;
+import org.apache.ambari.server.orm.entities.ClusterVersionEntity;
+import org.apache.ambari.server.orm.entities.HostRoleCommandEntity;
+import org.apache.ambari.server.orm.entities.UpgradeEntity;
+import org.apache.ambari.server.state.Cluster;
+import org.apache.ambari.server.state.Clusters;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Monitors commands during Stack Upgrade that are in a HOLDING_* status because they failed, in order to retry them
+ * automatically until they exceed a certain threshold of retry time.
+ */
+@AmbariService
+public class RetryUpgradeActionService extends AbstractScheduledService {
+
+  private final static Logger LOG = LoggerFactory.getLogger(RetryUpgradeActionService.class);
+
+  @Inject
+  private Injector m_injector;
+
+  @Inject
+  private Provider<Clusters> m_clustersProvider;
+
+  /**
+   * Configuration.
+   */
+  @Inject
+  private Configuration m_configuration;
+
+  @Inject
+  private HostRoleCommandDAO m_hostRoleCommandDAO;
+
+  private final List<HostRoleStatus> HOLDING_STATUSES = Arrays.asList(HostRoleStatus.HOLDING_FAILED, HostRoleStatus.HOLDING_TIMEDOUT);
+
+  private List<String> CUSTOM_COMMAND_NAMES_TO_IGNORE;
+  private List<String> COMMAND_DETAILS_TO_IGNORE;
+
+  /**
+   * Tasks will be retried up to this many minutes after their original start time.
+   */
+  private int MAX_TIMEOUT_MINS;
+  private Long MAX_TIMEOUT_MS;
+
+  /**
+   * Date formatter to print full dates
+   */
+  private DateFormat m_fullDateFormat;
+
+  /**
+   * Date formatter to print time deltas in HH:mm:ss
+   */
+  private SimpleDateFormat m_deltaDateFormat;
+
+
+  public RetryUpgradeActionService() {
+    this.m_fullDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    TimeZone tz = TimeZone.getTimeZone("UTC");
+    this.m_deltaDateFormat = new SimpleDateFormat("HH:mm:ss");
+    this.m_deltaDateFormat.setTimeZone(tz);
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  protected Scheduler scheduler() {
+    // Suggested every 10-60 secs.
+    int secs = m_configuration.getStackUpgradeAutoRetryCheckIntervalSecs();
+    return Scheduler.newFixedDelaySchedule(0, secs, TimeUnit.SECONDS);
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p/>
+   * Only run if the timeout mins is positive.
+   */
+  @Override
+  protected void startUp() throws Exception {
+    this.MAX_TIMEOUT_MINS = m_configuration.getStackUpgradeAutoRetryTimeoutMins();
+    this.MAX_TIMEOUT_MS = MAX_TIMEOUT_MINS * 60000L;
+
+    if (this.MAX_TIMEOUT_MINS < 1) {
+      LOG.info("Will not start service {} used to auto-retry failed actions during " +
+          "Stack Upgrade since since the property {} is either invalid/missing or set to {}",
+          this.getClass().getSimpleName(), Configuration.STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY, MAX_TIMEOUT_MINS);
+      stopAsync();
+    }
+
+    // During Stack Upgrade, some tasks don't make sense to auto-retry since they are either
+    // running on the server, should only be run multiple times with human intervention,
+    // or are not going to succeed on repeat attempts because they involve DB queries and not necessarily down hosts.
+    this.CUSTOM_COMMAND_NAMES_TO_IGNORE = m_configuration.getStackUpgradeAutoRetryCustomCommandNamesToIgnore();
+    this.COMMAND_DETAILS_TO_IGNORE = m_configuration.getStackUpgradeAutoRetryCommandDetailsToIgnore();
+  }
+
+  public void setMaxTimeout(int mins) {
+    this.MAX_TIMEOUT_MINS = mins;
+    this.MAX_TIMEOUT_MS = MAX_TIMEOUT_MINS * 60000L;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p/>
+   * Analyze each cluster for any active upgrades and attempt to retry any actions in a HOLDING_* status.
+   */
+  @Override
+  protected void runOneIteration() throws Exception {
+    Map<String, Cluster> clusterMap = m_clustersProvider.get().getClusters();
+    for (Cluster cluster : clusterMap.values()) {
+      try {
+        LOG.debug("Analyzing tasks for cluster {} that can be retried during Stack Upgrade.", cluster.getClusterName());
+        Long effectiveRequestId = getActiveUpgradeRequestId(cluster);
+        if (effectiveRequestId != null) {
+          LOG.debug("Upgrade is in-progress with request id {}.", effectiveRequestId);
+          retryHoldingCommandsInRequest(effectiveRequestId);
+        }
+      } catch (Exception e) {
+        LOG.error("Unable to analyze commands that may be retried for cluster with id {}. Exception: {}",
+            cluster.getClusterId(), e.getMessage());
+      }
+    }
+  }
+
+  /**
+   * Get the Request Id of the most recent stack upgrade if it is active.
+   * @param cluster Cluster
+   * @return Request Id of active stack upgrade.
+   */
+  private Long getActiveUpgradeRequestId(Cluster cluster) {
+    ClusterVersionEntity currentVersion = cluster.getCurrentClusterVersion();
+
+    if (currentVersion == null) {
+      LOG.debug("No Cluster Version exists as CURRENT. Skip retrying failed tasks.");
+      return null;
+    }
+
+    // May be null, and either upgrade or downgrade
+    UpgradeEntity currentUpgrade = cluster.getUpgradeEntity();
+    if (currentUpgrade == null) {
+      LOG.debug("There is no active stack upgrade in progress. Skip retrying failed tasks.");
+      return null;
+    }
+    LOG.debug("Found an active stack upgrade with id: {}, direction: {}, type: {}, from version: {}, to version: {}",
+        currentUpgrade.getId(), currentUpgrade.getDirection(), currentUpgrade.getUpgradeType(),
+        currentUpgrade.getFromVersion(), currentUpgrade.getToVersion());
+
+    return currentUpgrade.getRequestId();
+  }
+
+  /**
+   * Retry HOLDING_* tasks for the given Request Id if the tasks meet certain criteria.
+   * @param requestId Request Id to search tasks for.
+   */
+  @Transactional
+  private void retryHoldingCommandsInRequest(Long requestId) {
+    if (requestId == null) {
+      return;
+    }
+    long now = System.currentTimeMillis();
+
+    List<HostRoleCommandEntity> holdingCommands = m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES);
+    if (holdingCommands.size() > 0) {
+      for (HostRoleCommandEntity hrc : holdingCommands) {
+        LOG.debug("Comparing task id: {}, original start time: {}, now: {}",
+            hrc.getTaskId(), hrc.getOriginalStartTime(), now);
+
+        /*
+        While testing, can update the original_start_time of records in host_role_command table to current epoch time.
+        E.g. in postgres,
+        SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000;
+        UPDATE host_role_command SET attempt_count = 1, status = 'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000) WHERE task_id IN (x, y, z);
+         */
+        if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
+          Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
+          Long deltaMS = retryTimeWindow - now;
+
+          if (deltaMS > 0) {
+            String originalStartTimeString = this.m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
+            String deltaString = this.m_deltaDateFormat.format(new Date(deltaMS));
+            LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
+                hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+            retryHostRoleCommand(hrc);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Private method to determine if a Host Role Command can be retried.
+   * @param hrc Host Role Command entity.
+   * @return True if can be retried, otherwise false.
+   */
+  private boolean canRetryCommand(HostRoleCommandEntity hrc) {
+    if (hrc.isRetryAllowed() && !hrc.isFailureAutoSkipped()) {
+      // Important not to retry some steps during RU/EU like "Finalize Upgrade Pre-Check", "Execute HDFS Finalize", and "Save Cluster State".
+      // These elements are expected to be in lowercase already
+      if (null != hrc.getCustomCommandName()) {
+        for (String s : this.CUSTOM_COMMAND_NAMES_TO_IGNORE) {
+          if (hrc.getCustomCommandName().toLowerCase().contains(s)){
+            return false;
+          }
+        }
+      }
+      if (null != hrc.getCommandDetail()) {
+        for (String s : this.COMMAND_DETAILS_TO_IGNORE) {
+          if (hrc.getCommandDetail().toLowerCase().contains(s)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Private method to retry a Host Role Command by changing its status back to
+   * {@link org.apache.ambari.server.actionmanager.HostRoleStatus#PENDING} so
+   * that {@link org.apache.ambari.server.orm.DBAccessorImpl} can retry it.
+   * @param hrc Host Role Command entity
+   */
+  private void retryHostRoleCommand(HostRoleCommandEntity hrc) {
+    hrc.setStatus(HostRoleStatus.PENDING);
+    hrc.setStartTime(-1L);
+    // Don't change the original start time.
+    hrc.setLastAttemptTime(-1L);
+
+    // This will invalidate the cache, as expected.
+    m_hostRoleCommandDAO.merge(hrc);
+  }
+}

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java b/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java
index 9eb514a..00ecb98 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java
@@ -298,6 +298,7 @@ public class HostRequest implements Comparable<HostRequest> {
           logicalTask.setCustomCommandName(physicalTask.getCustomCommandName());
           //todo: once we retry on failures, start/end times could span multiple physical tasks
           logicalTask.setStartTime(physicalTask.getStartTime());
+          logicalTask.setOriginalStartTime(physicalTask.getOriginalStartTime());
           logicalTask.setEndTime(physicalTask.getEndTime());
           logicalTask.setErrorLog(physicalTask.getErrorLog());
           logicalTask.setExitCode(physicalTask.getExitCode());
@@ -343,6 +344,7 @@ public class HostRequest implements Comparable<HostRequest> {
           entity.setCustomCommandName(physicalTask.getCustomCommandName());
           //todo: once we retry on failures, start/end times could span multiple physical tasks
           entity.setStartTime(physicalTask.getStartTime());
+          entity.setOriginalStartTime(physicalTask.getOriginalStartTime());
           entity.setEndTime(physicalTask.getEndTime());
           entity.setErrorLog(physicalTask.getErrorLog());
           entity.setExitcode(physicalTask.getExitCode());

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java b/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java
index 82edbcf..6de615a 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java
@@ -47,7 +47,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Logical Request implementation.
+ * Logical Request implementation used to provision a cluster deployed by Blueprints.
  */
 public class LogicalRequest extends Request {
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java
index 7b83710..38a3614 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java
@@ -69,6 +69,7 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog {
   protected static final String PERMISSION_ID_COL = "permission_name";
   protected static final String SORT_ORDER_COL = "sort_order";
   protected static final String REPO_VERSION_TABLE = "repo_version";
+  protected static final String HOST_ROLE_COMMAND_TABLE = "host_role_command";
   protected static final String SERVICE_COMPONENT_DS_TABLE = "servicecomponentdesiredstate";
   protected static final String HOST_COMPONENT_DS_TABLE = "hostcomponentdesiredstate";
   protected static final String HOST_COMPONENT_STATE_TABLE = "hostcomponentstate";
@@ -637,4 +638,18 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog {
     dbAccessor.addColumn(SERVICE_COMPONENT_DESIRED_STATE_TABLE,
       new DBColumnInfo(DESIRED_VERSION_COLUMN_NAME, String.class, 255, State.UNKNOWN.toString(), false));
   }
+
+  /**
+   * Alters the host_role_command table to add original_start_time, which is needed because the start_time column now
+   * allows overriding the value in ActionScheduler.java. Existing rows are backfilled from start_time (or -1 when still NULL) before the column is made non-nullable.
+   * @throws SQLException if one of the database operations fails
+   */
+  private void updateHostRoleCommandTable() throws SQLException {
+    final String columnName = "original_start_time";
+    DBColumnInfo originalStartTimeColumn = new DBColumnInfo(columnName, Long.class, null, -1L, true);
+    dbAccessor.addColumn(HOST_ROLE_COMMAND_TABLE, originalStartTimeColumn);
+    dbAccessor.executeQuery("UPDATE " + HOST_ROLE_COMMAND_TABLE + " SET original_start_time = start_time", false);
+    dbAccessor.executeQuery("UPDATE " + HOST_ROLE_COMMAND_TABLE + " SET original_start_time=-1 WHERE original_start_time IS NULL"); // NOTE(review): unlike the previous call, no explicit second argument is passed; confirm both overloads behave as intended
+    dbAccessor.setColumnNullable(HOST_ROLE_COMMAND_TABLE, columnName, false);
+  }
 }

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql
index e5a1fb1..b306c0a 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql
@@ -239,6 +239,7 @@ CREATE TABLE host_role_command (
   role_command VARCHAR(255),
   stage_id BIGINT NOT NULL,
   start_time BIGINT NOT NULL,
+  original_start_time BIGINT NOT NULL,
   end_time BIGINT,
   status VARCHAR(255),
   auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql
index 5df4de6..37744f8 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql
@@ -229,6 +229,7 @@ CREATE TABLE host_role_command (
   role_command VARCHAR2(255) NULL,
   stage_id NUMBER(19) NOT NULL,
   start_time NUMBER(19) NOT NULL,
+  original_start_time NUMBER(19) NOT NULL,
   end_time NUMBER(19),
   status VARCHAR2(255) NULL,
   auto_skip_on_failure NUMBER(1) DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql
index 9a213b5..eba1745 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql
@@ -239,6 +239,7 @@ CREATE TABLE host_role_command (
   role VARCHAR(255),
   stage_id BIGINT NOT NULL,
   start_time BIGINT NOT NULL,
+  original_start_time BIGINT NOT NULL,
   end_time BIGINT,
   status VARCHAR(255),
   auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql
index 53d4bf5..cad0a39 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql
@@ -270,6 +270,7 @@ CREATE TABLE ambari.host_role_command (
   role VARCHAR(255),
   stage_id BIGINT NOT NULL,
   start_time BIGINT NOT NULL,
+  original_start_time BIGINT NOT NULL,
   end_time BIGINT,
   status VARCHAR(255),
   auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql
index c4e5031..346af50 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql
@@ -228,6 +228,7 @@ CREATE TABLE host_role_command (
   role_command VARCHAR(255),
   stage_id NUMERIC(19) NOT NULL,
   start_time NUMERIC(19) NOT NULL,
+  original_start_time NUMERIC(19) NOT NULL,
   end_time NUMERIC(19),
   status VARCHAR(255),
   auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql
index a50d21d..e238a76 100644
--- a/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql
+++ b/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql
@@ -250,6 +250,7 @@ CREATE TABLE host_role_command (
   role VARCHAR(255),
   stage_id BIGINT NOT NULL,
   start_time BIGINT NOT NULL,
+  original_start_time BIGINT NOT NULL,
   end_time BIGINT,
   status VARCHAR(255),
   auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL,

http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
new file mode 100644
index 0000000..2fb57d7
--- /dev/null
+++ b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ambari.server.state.services;
+
+import com.google.inject.Guice;
+import com.google.inject.Injector;
+import org.apache.ambari.server.AmbariException;
+import org.apache.ambari.server.Role;
+import org.apache.ambari.server.RoleCommand;
+import org.apache.ambari.server.actionmanager.HostRoleStatus;
+import org.apache.ambari.server.orm.GuiceJpaInitializer;
+import org.apache.ambari.server.orm.InMemoryDefaultTestModule;
+import org.apache.ambari.server.orm.OrmTestHelper;
+import org.apache.ambari.server.orm.dao.HostRoleCommandDAO;
+import org.apache.ambari.server.orm.dao.RepositoryVersionDAO;
+import org.apache.ambari.server.orm.dao.RequestDAO;
+import org.apache.ambari.server.orm.dao.StackDAO;
+import org.apache.ambari.server.orm.dao.StageDAO;
+import org.apache.ambari.server.orm.dao.UpgradeDAO;
+import org.apache.ambari.server.orm.entities.HostRoleCommandEntity;
+import org.apache.ambari.server.orm.entities.RepositoryVersionEntity;
+import org.apache.ambari.server.orm.entities.RequestEntity;
+import org.apache.ambari.server.orm.entities.StackEntity;
+import org.apache.ambari.server.orm.entities.StageEntity;
+import org.apache.ambari.server.orm.entities.UpgradeEntity;
+import org.apache.ambari.server.state.Cluster;
+import org.apache.ambari.server.state.Clusters;
+import org.apache.ambari.server.state.RepositoryVersionState;
+import org.apache.ambari.server.state.StackId;
+import org.apache.ambari.server.state.stack.upgrade.Direction;
+import org.apache.ambari.server.state.stack.upgrade.UpgradeType;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Tests {@link org.apache.ambari.server.state.services.RetryUpgradeActionService}.
+ */
+public class RetryUpgradeActionServiceTest {
+
+  private Injector injector;
+
+  private StackDAO stackDAO;
+  private Clusters clusters;
+  private RepositoryVersionDAO repoVersionDAO;
+  private UpgradeDAO upgradeDAO;
+  private RequestDAO requestDAO;
+  private StageDAO stageDAO;
+  private HostRoleCommandDAO hostRoleCommandDAO;
+  private OrmTestHelper helper;
+
+  // Instance variables shared by all tests
+  String clusterName = "c1";
+  Cluster cluster;
+  StackEntity stackEntity220;
+  StackId stack220;
+  Long upgradeRequestId = 1L;
+  Long stageId = 1L;
+
+  @Before
+  public void before() throws NoSuchFieldException, IllegalAccessException {
+    injector = Guice.createInjector(new InMemoryDefaultTestModule());
+    injector.getInstance(GuiceJpaInitializer.class);
+
+    stackDAO = injector.getInstance(StackDAO.class);
+    clusters = injector.getInstance(Clusters.class);
+    repoVersionDAO = injector.getInstance(RepositoryVersionDAO.class);
+    upgradeDAO = injector.getInstance(UpgradeDAO.class);
+    requestDAO = injector.getInstance(RequestDAO.class);
+    stageDAO = injector.getInstance(StageDAO.class);
+    hostRoleCommandDAO = injector.getInstance(HostRoleCommandDAO.class);
+    helper = injector.getInstance(OrmTestHelper.class);
+  }
+
+  /**
+   * Tests that the Guava service allows retrying certain failed actions during a stack upgrade.
+   * Case 1: No cluster => no-op
+   * Case 2: Cluster and valid timeout, but no active upgrade => no-op
+   * Case 3: Cluster with an active upgrade, but no HOLDING_FAILED|HOLDING_TIMEDOUT commands => no-op
+   * Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   * does NOT meet conditions to be retried => no-op
+   * Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   * DOES meet conditions to be retried => retries the task
+   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   * was already retried and has now expired => no-op
+   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+   * during Finalize Cluster, which should not be retried => no-op
+   * @throws Exception if any step of the scenario fails unexpectedly
+   */
+  @Test
+  public void test() throws Exception {
+    int timeoutMins = 1;
+    RetryUpgradeActionService service = injector.getInstance(RetryUpgradeActionService.class);
+    service.startUp();
+
+    // Case 1: No cluster
+    service.runOneIteration();
+
+    // Case 2: Cluster and valid timeout, but no active upgrade
+    createCluster();
+    service.setMaxTimeout(timeoutMins);
+    service.runOneIteration();
+
+    // Case 3: Cluster with an active upgrade, but no HOLDING_FAILED|HOLDING_TIMEDOUT commands.
+    prepareUpgrade();
+
+    // Run the service
+    service.runOneIteration();
+
+    // Assert that commands exist and that none of them is in PENDING
+    List<HostRoleCommandEntity> commands = hostRoleCommandDAO.findAll();
+    Assert.assertTrue(!commands.isEmpty());
+    for (HostRoleCommandEntity hrc : commands) {
+      if (hrc.getStatus() == HostRoleStatus.PENDING) {
+        Assert.fail("Did not expect any HostRoleCommands to be PENDING");
+      }
+    }
+
+    // Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that does NOT meet conditions to be retried.
+    List<StageEntity> stages = stageDAO.findByStageIds(upgradeRequestId, new HashSet<Long>(){{ add(stageId); }});
+    Assert.assertTrue(!stages.isEmpty() && stages.size() == 1);
+    StageEntity stageEntity = stages.get(0);
+
+    HostRoleCommandEntity hrc2 = new HostRoleCommandEntity();
+    hrc2.setStage(stageEntity);
+    hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
+    hrc2.setRole(Role.ZOOKEEPER_SERVER);
+    hrc2.setRoleCommand(RoleCommand.RESTART);
+    hrc2.setRetryAllowed(false);
+    hrc2.setAutoSkipOnFailure(false);
+    stageEntity.getHostRoleCommands().add(hrc2);
+
+    hostRoleCommandDAO.create(hrc2);
+    stageDAO.merge(stageEntity);
+
+    // Run the service
+    service.runOneIteration();
+
+    commands = hostRoleCommandDAO.findAll();
+    Assert.assertTrue(!commands.isEmpty() && commands.size() == 2);
+    for (HostRoleCommandEntity hrc : commands) {
+      if (hrc.getStatus() == HostRoleStatus.PENDING) {
+        Assert.fail("Did not expect any HostRoleCommands to be PENDING");
+      }
+    }
+
+    // Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that DOES meet conditions to be retried.
+    long now = System.currentTimeMillis();
+    hrc2.setRetryAllowed(true);
+    hrc2.setOriginalStartTime(now);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING
+    Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+
+    // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
+    now = System.currentTimeMillis();
+    hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1);
+    hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+
+    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+    // during Finalize Cluster, which should not be retried.
+    now = System.currentTimeMillis();
+    hrc2.setOriginalStartTime(now);
+    hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
+    hrc2.setCustomCommandName("org.apache.ambari.server.serveraction.upgrades.FinalizeUpgradeAction");
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+  }
+
+  /**
+   * Creates cluster "c1" on stack HDP-2.2.0, registers its repository version, and transitions the cluster version to CURRENT.
+   * @throws AmbariException
+   */
+  private void createCluster() throws AmbariException {
+    stackEntity220 = stackDAO.find("HDP", "2.2.0");
+    stack220 = new StackId("HDP-2.2.0");
+
+    clusters.addCluster(clusterName, stack220);
+    cluster = clusters.getCluster("c1");
+
+    RepositoryVersionEntity repoVersionEntity = new RepositoryVersionEntity();
+    repoVersionEntity.setDisplayName("Initial Version");
+    repoVersionEntity.setOperatingSystems("");
+    repoVersionEntity.setStack(stackEntity220);
+    repoVersionEntity.setVersion("2.2.0.0");
+    repoVersionDAO.create(repoVersionEntity);
+
+    helper.getOrCreateRepositoryVersion(stack220, stack220.getStackVersion());
+
+    cluster.createClusterVersion(stack220, stack220.getStackVersion(), "admin", RepositoryVersionState.INSTALLING);
+    cluster.transitionClusterVersion(stack220, stack220.getStackVersion(), RepositoryVersionState.CURRENT);
+  }
+
+  /**
+   * Creates a new repo version, plus the request, stage, upgrade entity and one COMPLETED task needed for a ROLLING stack upgrade.
+   * @throws AmbariException
+   */
+  private void prepareUpgrade() throws AmbariException {
+    RepositoryVersionEntity repoVersionEntity = new RepositoryVersionEntity();
+    repoVersionEntity.setDisplayName("Version to Upgrade To");
+    repoVersionEntity.setOperatingSystems("");
+    repoVersionEntity.setStack(stackEntity220);
+    repoVersionEntity.setVersion("2.2.0.1");
+    repoVersionDAO.create(repoVersionEntity);
+
+    helper.getOrCreateRepositoryVersion(stack220, stack220.getStackVersion());
+
+    RequestEntity requestEntity = new RequestEntity();
+    requestEntity.setRequestId(upgradeRequestId);
+    requestEntity.setClusterId(cluster.getClusterId());
+    requestEntity.setStatus(HostRoleStatus.PENDING);
+    requestDAO.create(requestEntity);
+
+    // Create the stage and add it to the request
+    StageEntity stageEntity = new StageEntity();
+    stageEntity.setRequest(requestEntity);
+    stageEntity.setClusterId(cluster.getClusterId());
+    stageEntity.setRequestId(upgradeRequestId);
+    stageEntity.setStageId(stageId);
+    requestEntity.setStages(Collections.singletonList(stageEntity));
+    stageDAO.create(stageEntity);
+    requestDAO.merge(requestEntity);
+
+    UpgradeEntity upgrade = new UpgradeEntity();
+    upgrade.setId(1L);
+    upgrade.setRequestId(upgradeRequestId);
+    upgrade.setClusterId(cluster.getClusterId());
+    upgrade.setUpgradePackage("some-name");
+    upgrade.setUpgradeType(UpgradeType.ROLLING);
+    upgrade.setDirection(Direction.UPGRADE);
+    upgrade.setFromVersion("2.2.0.0");
+    upgrade.setToVersion("2.2.0.1");
+    upgradeDAO.create(upgrade);
+
+    cluster.setUpgradeEntity(upgrade);
+
+    // Create the task and add it to the stage
+    HostRoleCommandEntity hrc1 = new HostRoleCommandEntity();
+
+    hrc1.setStage(stageEntity);
+    hrc1.setStatus(HostRoleStatus.COMPLETED);
+    hrc1.setRole(Role.ZOOKEEPER_SERVER);
+    hrc1.setRoleCommand(RoleCommand.RESTART);
+
+    stageEntity.setHostRoleCommands(new ArrayList<HostRoleCommandEntity>());
+    stageEntity.getHostRoleCommands().add(hrc1);
+    hostRoleCommandDAO.create(hrc1);
+    stageDAO.merge(stageEntity);
+  }
+}


Mime
View raw message