Return-Path: X-Original-To: apmail-ambari-commits-archive@www.apache.org Delivered-To: apmail-ambari-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8C8DA19221 for ; Mon, 21 Mar 2016 21:00:30 +0000 (UTC) Received: (qmail 26818 invoked by uid 500); 21 Mar 2016 21:00:30 -0000 Delivered-To: apmail-ambari-commits-archive@ambari.apache.org Received: (qmail 26786 invoked by uid 500); 21 Mar 2016 21:00:30 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 26777 invoked by uid 99); 21 Mar 2016 21:00:30 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 21 Mar 2016 21:00:30 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 4079FDFA6C; Mon, 21 Mar 2016 21:00:30 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: alejandro@apache.org To: commits@ambari.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: ambari git commit: AMBARI-15446. Auto-retry on failure during RU/EU (alejandro) Date: Mon, 21 Mar 2016 21:00:30 +0000 (UTC) Repository: ambari Updated Branches: refs/heads/trunk 0521787d8 -> 79b9e570f AMBARI-15446. 
Auto-retry on failure during RU/EU (alejandro) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/79b9e570 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/79b9e570 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/79b9e570 Branch: refs/heads/trunk Commit: 79b9e570fe1549bfc312dc02c29246c56043face Parents: 0521787 Author: Alejandro Fernandez Authored: Mon Mar 21 11:54:39 2016 -0700 Committer: Alejandro Fernandez Committed: Mon Mar 21 14:00:19 2016 -0700 ---------------------------------------------------------------------- .../actionmanager/ActionDBAccessorImpl.java | 5 + .../server/actionmanager/HostRoleCommand.java | 12 + .../ambari/server/agent/HeartbeatProcessor.java | 5 + .../server/configuration/Configuration.java | 109 +++++++ .../server/orm/dao/HostRoleCommandDAO.java | 10 + .../orm/entities/HostRoleCommandEntity.java | 28 ++ .../services/RetryUpgradeActionService.java | 269 ++++++++++++++++++ .../ambari/server/topology/HostRequest.java | 2 + .../ambari/server/topology/LogicalRequest.java | 2 +- .../server/upgrade/UpgradeCatalog240.java | 15 + .../main/resources/Ambari-DDL-MySQL-CREATE.sql | 1 + .../main/resources/Ambari-DDL-Oracle-CREATE.sql | 1 + .../resources/Ambari-DDL-Postgres-CREATE.sql | 1 + .../Ambari-DDL-Postgres-EMBEDDED-CREATE.sql | 1 + .../resources/Ambari-DDL-SQLAnywhere-CREATE.sql | 1 + .../resources/Ambari-DDL-SQLServer-CREATE.sql | 1 + .../services/RetryUpgradeActionServiceTest.java | 284 +++++++++++++++++++ 17 files changed, 746 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java 
b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java index 429f573..a75a000 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java @@ -601,6 +601,9 @@ public class ActionDBAccessorImpl implements ActionDBAccessor { HostRoleCommandEntity entity = hostRoleCommandDAO.findByPK(hostRoleCommand.getTaskId()); if (entity != null) { entity.setStartTime(hostRoleCommand.getStartTime()); + if (entity.getOriginalStartTime() == null || entity.getOriginalStartTime() == -1) { + entity.setOriginalStartTime(System.currentTimeMillis()); + } entity.setLastAttemptTime(hostRoleCommand.getLastAttemptTime()); entity.setStatus(hostRoleCommand.getStatus()); entity.setAttemptCount(hostRoleCommand.getAttemptCount()); @@ -738,6 +741,8 @@ public class ActionDBAccessorImpl implements ActionDBAccessor { List tasks = hostRoleCommandDAO.findByPKs(taskIds); for (HostRoleCommandEntity task : tasks) { task.setStatus(HostRoleStatus.PENDING); + // TODO HACK, shouldn't reset start time. + // Because it expects -1, RetryActionMonitor.java also had to set it to -1. 
task.setStartTime(-1L); task.setEndTime(-1L); } http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java index 2764b3f..83fa6b9 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java @@ -58,6 +58,7 @@ public class HostRoleCommand { private String structuredOut = ""; private int exitCode = 999; //Default is unknown private long startTime = -1; + private long originalStartTime = -1; private long endTime = -1; private long lastAttemptTime = -1; private short attemptCount = 0; @@ -160,6 +161,7 @@ public class HostRoleCommand { structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new String(hostRoleCommandEntity.getStructuredOut()) : ""; exitCode = hostRoleCommandEntity.getExitcode(); startTime = hostRoleCommandEntity.getStartTime(); + originalStartTime = hostRoleCommandEntity.getOriginalStartTime(); endTime = hostRoleCommandEntity.getEndTime() != null ? 
hostRoleCommandEntity.getEndTime() : -1L; lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime(); attemptCount = hostRoleCommandEntity.getAttemptCount(); @@ -182,6 +184,7 @@ public class HostRoleCommand { hostRoleCommandEntity.setStdOut(stdout.getBytes()); hostRoleCommandEntity.setStructuredOut(structuredOut.getBytes()); hostRoleCommandEntity.setStartTime(startTime); + hostRoleCommandEntity.setOriginalStartTime(originalStartTime); hostRoleCommandEntity.setEndTime(endTime); hostRoleCommandEntity.setLastAttemptTime(lastAttemptTime); hostRoleCommandEntity.setAttemptCount(attemptCount); @@ -328,6 +331,14 @@ public class HostRoleCommand { this.startTime = startTime; } + public long getOriginalStartTime() { + return originalStartTime; + } + + public void setOriginalStartTime(long originalStartTime) { + this.originalStartTime = originalStartTime; + } + public long getLastAttemptTime() { return lastAttemptTime; } @@ -429,6 +440,7 @@ public class HostRoleCommand { builder.append(" stderr: ").append(stderr).append("\n"); builder.append(" exitcode: ").append(exitCode).append("\n"); builder.append(" Start time: ").append(startTime).append("\n"); + builder.append(" Original Start time: ").append(originalStartTime).append("\n"); builder.append(" Last attempt time: ").append(lastAttemptTime).append("\n"); builder.append(" attempt count: ").append(attemptCount).append("\n"); return builder.toString(); http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java index a1a686a..7e57031 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java +++ 
b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartbeatProcessor.java @@ -403,6 +403,11 @@ public class HeartbeatProcessor extends AbstractService{ if (hostRoleCommand.getStatus() == HostRoleStatus.QUEUED && report.getStatus().equals("IN_PROGRESS")) { hostRoleCommand.setStartTime(now); + + // Because the task may be retried several times, set the original start time only once. + if (hostRoleCommand.getOriginalStartTime() == -1) { + hostRoleCommand.setOriginalStartTime(now); + } } // If the report indicates the keytab file was successfully transferred to a host or removed http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java b/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java index 9404506..bf18325 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/configuration/Configuration.java @@ -39,6 +39,7 @@ import org.apache.ambari.server.state.stack.OsFamily; import org.apache.ambari.server.utils.AmbariPath; import org.apache.ambari.server.utils.Parallel; import org.apache.ambari.server.utils.ShellCommandUtil; +import org.apache.commons.collections.ListUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.StringUtils; @@ -210,9 +211,41 @@ public class Configuration { // Properties for stack upgrade (Rolling, Express) public static final String ROLLING_UPGRADE_SKIP_PACKAGES_PREFIXES_KEY = "rolling.upgrade.skip.packages.prefixes"; public static final String ROLLING_UPGRADE_SKIP_PACKAGES_PREFIXES_DEFAULT = ""; + public static final String STACK_UPGRADE_BYPASS_PRECHECKS_KEY = 
"stack.upgrade.bypass.prechecks"; public static final String STACK_UPGRADE_BYPASS_PRECHECKS_DEFAULT = "false"; + /** + * If a host is shutdown or ambari-agent is stopped, then Ambari Server will still keep waiting til the task timesout, + * say 10-20 mins. If the host comes back online and ambari-agent is started, then need this retry property + * to be greater; ideally, it should be greater than 2 * command_timeout in order to retry at least + * 3 times in that amount of mins. + * Suggested value is 15-30 mins. + */ + public static final String STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY = "stack.upgrade.auto.retry.timeout.mins"; + public static final String STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_DEFAULT = "0"; + + /** + * If the stack.upgrade.auto.retry.timeout.mins property is positive, then run RetryUpgradeActionService every x + * seconds. + */ + public static final String STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_KEY = "stack.upgrade.auto.retry.check.interval.secs"; + public static final String STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_DEFAULT = "20"; + + /** + * If auto-retry during stack upgrade is enabled, skip any tasks whose custom command name contains at least one + * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas. + */ + public static final String STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_KEY = "stack.upgrade.auto.retry.command.names.to.ignore"; + public static final String STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_DEFAULT = "\"ComponentVersionCheckAction\",\"FinalizeUpgradeAction\""; + + /** + * If auto-retry during stack upgrade is enabled, skip any tasks whose command details contains at least one + * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas. 
+ */ + public static final String STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_KEY = "stack.upgrade.auto.retry.command.details.to.ignore"; + public static final String STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_DEFAULT = "\"Execute HDFS Finalize\""; + public static final String JWT_AUTH_ENBABLED = "authentication.jwt.enabled"; public static final String JWT_AUTH_PROVIDER_URL = "authentication.jwt.providerUrl"; public static final String JWT_PUBLIC_KEY = "authentication.jwt.publicKey"; @@ -1110,6 +1143,82 @@ public class Configuration { } /** + * During stack upgrade, can auto-retry failures for up to x mins. This is useful to improve the robustness in unstable environments. + * Suggested value is 0-30 mins. + * @return + */ + public int getStackUpgradeAutoRetryTimeoutMins() { + Integer result = NumberUtils.toInt(properties.getProperty(STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY, STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_DEFAULT)); + return result >= 0 ? result : 0; + } + + /** + * If the stack.upgrade.auto.retry.timeout.mins property is positive, then run RetryUpgradeActionService every x + * seconds. + * @return Number of seconds between runs of {@link org.apache.ambari.server.state.services.RetryUpgradeActionService} + */ + public int getStackUpgradeAutoRetryCheckIntervalSecs() { + Integer result = NumberUtils.toInt(properties.getProperty(STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_KEY, STACK_UPGRADE_AUTO_RETRY_CHECK_INTERVAL_SECS_DEFAULT)); + return result >= 0 ? result : 0; + } + + /** + * If auto-retry during stack upgrade is enabled, skip any tasks whose custom command name contains at least one + * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas. 
+ * @return + */ + public List getStackUpgradeAutoRetryCustomCommandNamesToIgnore() { + String value = properties.getProperty(STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_KEY, STACK_UPGRADE_AUTO_RETRY_CUSTOM_COMMAND_NAMES_TO_IGNORE_DEFAULT); + List list = convertCSVwithQuotesToList(value); + listToLowerCase(list); + return list; + } + + /** + * If auto-retry during stack upgrade is enabled, skip any tasks whose command details contains at least one + * of the strings in the following CSV property. Note that values have to be enclosed in quotes and separated by commas. + * @return + */ + public List getStackUpgradeAutoRetryCommandDetailsToIgnore() { + String value = properties.getProperty(STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_KEY, STACK_UPGRADE_AUTO_RETRY_COMMAND_DETAILS_TO_IGNORE_DEFAULT); + List list = convertCSVwithQuotesToList(value); + listToLowerCase(list); + return list; + } + + /** + * Convert quoted elements separated by commas into a list. Values cannot contain double quotes or commas. + * @param value, e.g., String with value "a","b","c" => ["a", "b", "c"] + * @return List of parsed values, or empty list if no values exist. + */ + private List convertCSVwithQuotesToList(String value) { + List list = new ArrayList<>(); + if (StringUtils.isNotEmpty(value)) { + if (value.indexOf(",") >= 0) { + for (String e : value.split(",")) { + e = StringUtils.stripStart(e, "\""); + e = StringUtils.stripEnd(e, "\""); + list.add(e); + } + } else { + list.add(value); + } + } + return list; + } + + /** + * Convert the elements of a list to lowercase. + * @param list + */ + private void listToLowerCase(List list) { + if (list == null) return; + for (int i = 0; i < list.size(); i++) { + list.set(i, list.get(i).toLowerCase()); + } + } + + /** * Get the map with server config parameters. 
* Keys - public constants of this class * @return the map with server config parameters http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java index f5b1cb4..2e35001 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/orm/dao/HostRoleCommandDAO.java @@ -313,6 +313,16 @@ public class HostRoleCommandDAO { } @RequiresSession + public List findByRequestIdAndStatuses(Long requestId, Collection statuses) { + TypedQuery query = entityManagerProvider.get().createNamedQuery( + "HostRoleCommandEntity.findByRequestIdAndStatuses", HostRoleCommandEntity.class); + query.setParameter("requestId", requestId); + query.setParameter("statuses", statuses); + List results = query.getResultList(); + return results; + } + + @RequiresSession public List findTaskIdsByRequestIds(Collection requestIds) { TypedQuery query = entityManagerProvider.get().createQuery( "SELECT task.taskId FROM HostRoleCommandEntity task " + http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java b/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java index 19f0602..6288091 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java +++ 
b/ambari-server/src/main/java/org/apache/ambari/server/orm/entities/HostRoleCommandEntity.java @@ -61,6 +61,7 @@ import org.apache.commons.lang.ArrayUtils; ) @NamedQueries({ @NamedQuery(name = "HostRoleCommandEntity.findCountByCommandStatuses", query = "SELECT COUNT(command.taskId) FROM HostRoleCommandEntity command WHERE command.status IN :statuses"), + @NamedQuery(name = "HostRoleCommandEntity.findByRequestIdAndStatuses", query="SELECT task FROM HostRoleCommandEntity task WHERE task.requestId=:requestId AND task.status IN :statuses ORDER BY task.taskId ASC"), @NamedQuery(name = "HostRoleCommandEntity.findTasksByStatusesOrderByIdDesc", query = "SELECT task FROM HostRoleCommandEntity task WHERE task.requestId = :requestId AND task.status IN :statuses ORDER BY task.taskId DESC"), @NamedQuery(name = "HostRoleCommandEntity.findNumTasksAlreadyRanInStage", query = "SELECT COUNT(task.taskId) FROM HostRoleCommandEntity task WHERE task.requestId = :requestId AND task.taskId > :taskId AND task.stageId > :stageId AND task.status NOT IN :statuses"), @NamedQuery(name = "HostRoleCommandEntity.findByCommandStatuses", query = "SELECT command FROM HostRoleCommandEntity command WHERE command.status IN :statuses ORDER BY command.requestId, command.stageId"), @@ -136,6 +137,13 @@ public class HostRoleCommandEntity { @Column(name = "start_time", nullable = false) private Long startTime = -1L; + /** + * Because the startTime is allowed to be overwritten, introduced a new column for the original start time. + */ + @Basic + @Column(name = "original_start_time", nullable = false) + private Long originalStartTime = -1L; + @Basic @Column(name = "end_time", nullable = false) private Long endTime = -1L; @@ -283,6 +291,22 @@ public class HostRoleCommandEntity { this.startTime = startTime; } + /** + * Get the original time the command was first scheduled on the agent. This value is never overwritten. 
+ * @return Original start time + */ + public Long getOriginalStartTime() { + return originalStartTime; + } + + /** + * Set the original start time when the command is first scheduled. This value is never overwritten. + * @param originalStartTime Original start time + */ + public void setOriginalStartTime(Long originalStartTime) { + this.originalStartTime = originalStartTime; + } + public Long getLastAttemptTime() { return lastAttemptTime; } @@ -421,6 +445,9 @@ public class HostRoleCommandEntity { if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) { return false; } + if (originalStartTime != null ? !originalStartTime.equals(that.originalStartTime) : that.originalStartTime != null) { + return false; + } if (status != null ? !status.equals(that.status) : that.status != null) { return false; } @@ -464,6 +491,7 @@ public class HostRoleCommandEntity { result = 31 * result + (outputLog != null ? outputLog.hashCode() : 0); result = 31 * result + (errorLog != null ? errorLog.hashCode() : 0); result = 31 * result + (startTime != null ? startTime.hashCode() : 0); + result = 31 * result + (originalStartTime != null ? originalStartTime.hashCode() : 0); result = 31 * result + (lastAttemptTime != null ? lastAttemptTime.hashCode() : 0); result = 31 * result + (attemptCount != null ? attemptCount.hashCode() : 0); result = 31 * result + (endTime != null ? 
endTime.hashCode() : 0); http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java new file mode 100644 index 0000000..7d81cc4 --- /dev/null +++ b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java @@ -0,0 +1,269 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.ambari.server.state.services; + +import com.google.common.util.concurrent.AbstractScheduledService; +import com.google.inject.Inject; +import com.google.inject.Injector; +import com.google.inject.Provider; +import com.google.inject.persist.Transactional; +import org.apache.ambari.server.AmbariService; +import org.apache.ambari.server.actionmanager.HostRoleStatus; +import org.apache.ambari.server.configuration.Configuration; +import org.apache.ambari.server.orm.dao.HostRoleCommandDAO; +import org.apache.ambari.server.orm.entities.ClusterVersionEntity; +import org.apache.ambari.server.orm.entities.HostRoleCommandEntity; +import org.apache.ambari.server.orm.entities.UpgradeEntity; +import org.apache.ambari.server.state.Cluster; +import org.apache.ambari.server.state.Clusters; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; + +/** + * Monitors commands during Stack Upgrade that are in a HOLDING_* failed because they failed in order to retry them + * automatically until they exceed a certain threshold of retry time. + */ +@AmbariService +public class RetryUpgradeActionService extends AbstractScheduledService { + + private final static Logger LOG = LoggerFactory.getLogger(RetryUpgradeActionService.class); + + @Inject + private Injector m_injector; + + @Inject + private Provider m_clustersProvider; + + /** + * Configuration. 
+ */ + @Inject + private Configuration m_configuration; + + @Inject + private HostRoleCommandDAO m_hostRoleCommandDAO; + + private final List HOLDING_STATUSES = Arrays.asList(HostRoleStatus.HOLDING_FAILED, HostRoleStatus.HOLDING_TIMEDOUT); + + private List CUSTOM_COMMAND_NAMES_TO_IGNORE; + private List COMMAND_DETAILS_TO_IGNORE; + + /** + * Tasks will be retried up to this many minutes after their original start time. + */ + private int MAX_TIMEOUT_MINS; + private Long MAX_TIMEOUT_MS; + + /** + * Date formatter to print full dates + */ + private DateFormat m_fullDateFormat; + + /** + * Date formatter to print time deltas in HH:MM:ss + */ + private SimpleDateFormat m_deltaDateFormat; + + + public RetryUpgradeActionService() { + this.m_fullDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + TimeZone tz = TimeZone.getTimeZone("UTC"); + this.m_deltaDateFormat = new SimpleDateFormat("HH:mm:ss"); + this.m_deltaDateFormat.setTimeZone(tz); + } + + /** + * {@inheritDoc} + */ + @Override + protected Scheduler scheduler() { + // Suggested every 10-60 secs. + int secs = m_configuration.getStackUpgradeAutoRetryCheckIntervalSecs(); + return Scheduler.newFixedDelaySchedule(0, secs, TimeUnit.SECONDS); + } + + /** + * {@inheritDoc} + *

+ * Only run if the timeout mins is positive. + */ + @Override + protected void startUp() throws Exception { + this.MAX_TIMEOUT_MINS = m_configuration.getStackUpgradeAutoRetryTimeoutMins(); + this.MAX_TIMEOUT_MS = MAX_TIMEOUT_MINS * 60000L; + + if (this.MAX_TIMEOUT_MINS < 1) { + LOG.info("Will not start service {} used to auto-retry failed actions during " + + "Stack Upgrade since since the property {} is either invalid/missing or set to {}", + this.getClass().getSimpleName(), Configuration.STACK_UPGRADE_AUTO_RETRY_TIMEOUT_MINS_KEY, MAX_TIMEOUT_MINS); + stopAsync(); + } + + // During Stack Upgrade, some tasks don't make sense to auto-retry since they are either + // running on the server, should only be run multiple times with human intervention, + // or are not going to succeed on repeat attempts because they involve DB queries and not necessarily down hosts. + this.CUSTOM_COMMAND_NAMES_TO_IGNORE = m_configuration.getStackUpgradeAutoRetryCustomCommandNamesToIgnore(); + this.COMMAND_DETAILS_TO_IGNORE = m_configuration.getStackUpgradeAutoRetryCommandDetailsToIgnore(); + } + + public void setMaxTimeout(int mins) { + this.MAX_TIMEOUT_MINS = mins; + this.MAX_TIMEOUT_MS = MAX_TIMEOUT_MINS * 60000L; + } + + /** + * {@inheritDoc} + *

+ * Analyze each cluster for any active upgrades and attempt to retry any actions in a HOLDING_* status. + */ + @Override + protected void runOneIteration() throws Exception { + Map clusterMap = m_clustersProvider.get().getClusters(); + for (Cluster cluster : clusterMap.values()) { + try { + LOG.debug("Analyzing tasks for cluster {} that can be retried during Stack Upgrade.", cluster.getClusterName()); + Long effectiveRequestId = getActiveUpgradeRequestId(cluster); + if (effectiveRequestId != null) { + LOG.debug("Upgrade is in-progress with request id {}.", effectiveRequestId); + retryHoldingCommandsInRequest(effectiveRequestId); + } + } catch (Exception e) { + LOG.error("Unable to analyze commands that may be retried for cluster with id {}. Exception: {}", + cluster.getClusterId(), e.getMessage()); + } + } + } + + /** + * Get the Request Id of the most recent stack upgrade if it is active. + * @param cluster Cluster + * @return Request Id of active stack upgrade. + */ + private Long getActiveUpgradeRequestId(Cluster cluster) { + ClusterVersionEntity currentVersion = cluster.getCurrentClusterVersion(); + + if (currentVersion == null) { + LOG.debug("No Cluster Version exists as CURRENT. Skip retrying failed tasks."); + return null; + } + + // May be null, and either upgrade or downgrade + UpgradeEntity currentUpgrade = cluster.getUpgradeEntity(); + if (currentUpgrade == null) { + LOG.debug("There is no active stack upgrade in progress. Skip retrying failed tasks."); + return null; + } + LOG.debug("Found an active stack upgrade with id: {}, direction: {}, type: {}, from version: {}, to version: {}", + currentUpgrade.getId(), currentUpgrade.getDirection(), currentUpgrade.getUpgradeType(), + currentUpgrade.getFromVersion(), currentUpgrade.getToVersion()); + + return currentUpgrade.getRequestId(); + } + + /** + * Retry HOLDING_* tasks for the given Request Id if the tasks meet certain criteria. + * @param requestId Request Id to search tasks for. 
+ */ + @Transactional + private void retryHoldingCommandsInRequest(Long requestId) { + if (requestId == null) { + return; + } + long now = System.currentTimeMillis(); + + List holdingCommands = m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES); + if (holdingCommands.size() > 0) { + for (HostRoleCommandEntity hrc : holdingCommands) { + LOG.debug("Comparing task id: {}, original start time: {}, now: {}", + hrc.getTaskId(), hrc.getOriginalStartTime(), now); + + /* + While testing, can update the original_start_time of records in host_role_command table to current epoch time. + E.g. in postgres, + SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000; + UPDATE host_role_command SET attempt_count = 1, status = 'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000) WHERE task_id IN (x, y, z); + */ + if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) { + Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS; + Long deltaMS = retryTimeWindow - now; + + if (deltaMS > 0) { + String originalStartTimeString = this.m_fullDateFormat.format(new Date(hrc.getOriginalStartTime())); + String deltaString = this.m_deltaDateFormat.format(new Date(deltaMS)); + LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}", + hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString); + retryHostRoleCommand(hrc); + } + } + } + } + } + + /** + * Private method to determine if a Host Role Command can be retried. + * @param hrc Host Role Command entity. + * @return True if can be retried, otherwise false. + */ + private boolean canRetryCommand(HostRoleCommandEntity hrc) { + if (hrc.isRetryAllowed() && !hrc.isFailureAutoSkipped()) { + // Important not to retry some steps during RU/EU like "Finalize Upgrade Pre-Check", "Execute HDFS Finalize", and "Save Cluster State". 
+ // These elements are expected to be in lowercase already + if (null != hrc.getCustomCommandName()) { + for (String s : this.CUSTOM_COMMAND_NAMES_TO_IGNORE) { + if (hrc.getCustomCommandName().toLowerCase().contains(s)){ + return false; + } + } + } + if (null != hrc.getCommandDetail()) { + for (String s : this.COMMAND_DETAILS_TO_IGNORE) { + if (hrc.getCommandDetail().toLowerCase().contains(s)) { + return false; + } + } + } + return true; + } + return false; + } + + /** + * Private method to retry a Host Role Command by changing its status back to + * {@link org.apache.ambari.server.actionmanager.HostRoleStatus#PENDING} so + * that {@link org.apache.ambari.server.orm.DBAccessorImpl} can retry it. + * @param hrc Host Role Command entity + */ + private void retryHostRoleCommand(HostRoleCommandEntity hrc) { + hrc.setStatus(HostRoleStatus.PENDING); + hrc.setStartTime(-1L); + // Don't change the original start time. + hrc.setLastAttemptTime(-1L); + + // This will invalidate the cache, as expected. 
+ m_hostRoleCommandDAO.merge(hrc); + } +} http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java b/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java index 9eb514a..00ecb98 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/topology/HostRequest.java @@ -298,6 +298,7 @@ public class HostRequest implements Comparable { logicalTask.setCustomCommandName(physicalTask.getCustomCommandName()); //todo: once we retry on failures, start/end times could span multiple physical tasks logicalTask.setStartTime(physicalTask.getStartTime()); + logicalTask.setOriginalStartTime(physicalTask.getOriginalStartTime()); logicalTask.setEndTime(physicalTask.getEndTime()); logicalTask.setErrorLog(physicalTask.getErrorLog()); logicalTask.setExitCode(physicalTask.getExitCode()); @@ -343,6 +344,7 @@ public class HostRequest implements Comparable { entity.setCustomCommandName(physicalTask.getCustomCommandName()); //todo: once we retry on failures, start/end times could span multiple physical tasks entity.setStartTime(physicalTask.getStartTime()); + entity.setOriginalStartTime(physicalTask.getOriginalStartTime()); entity.setEndTime(physicalTask.getEndTime()); entity.setErrorLog(physicalTask.getErrorLog()); entity.setExitcode(physicalTask.getExitCode()); http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java 
b/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java index 82edbcf..6de615a 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/topology/LogicalRequest.java @@ -47,7 +47,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Logical Request implementation. + * Logical Request implementation used to provision a cluster deployed by Blueprints. */ public class LogicalRequest extends Request { http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java index 7b83710..38a3614 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java @@ -69,6 +69,7 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog { protected static final String PERMISSION_ID_COL = "permission_name"; protected static final String SORT_ORDER_COL = "sort_order"; protected static final String REPO_VERSION_TABLE = "repo_version"; + protected static final String HOST_ROLE_COMMAND_TABLE = "host_role_command"; protected static final String SERVICE_COMPONENT_DS_TABLE = "servicecomponentdesiredstate"; protected static final String HOST_COMPONENT_DS_TABLE = "hostcomponentdesiredstate"; protected static final String HOST_COMPONENT_STATE_TABLE = "hostcomponentstate"; @@ -637,4 +638,18 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog { dbAccessor.addColumn(SERVICE_COMPONENT_DESIRED_STATE_TABLE, new DBColumnInfo(DESIRED_VERSION_COLUMN_NAME, String.class, 255, State.UNKNOWN.toString(), 
false)); } + + /** + * Alter host_role_command table to add original_start_time, which is needed because the start_time column now + * allows overriding the value in ActionScheduler.java + * @throws SQLException + */ + private void updateHostRoleCommandTable() throws SQLException { + final String columnName = "original_start_time"; + DBColumnInfo originalStartTimeColumn = new DBColumnInfo(columnName, Long.class, null, -1L, true); + dbAccessor.addColumn(HOST_ROLE_COMMAND_TABLE, originalStartTimeColumn); + dbAccessor.executeQuery("UPDATE " + HOST_ROLE_COMMAND_TABLE + " SET original_start_time = start_time", false); + dbAccessor.executeQuery("UPDATE " + HOST_ROLE_COMMAND_TABLE + " SET original_start_time=-1 WHERE original_start_time IS NULL"); + dbAccessor.setColumnNullable(HOST_ROLE_COMMAND_TABLE, columnName, false); + } } http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql index e5a1fb1..b306c0a 100644 --- a/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-MySQL-CREATE.sql @@ -239,6 +239,7 @@ CREATE TABLE host_role_command ( role_command VARCHAR(255), stage_id BIGINT NOT NULL, start_time BIGINT NOT NULL, + original_start_time BIGINT NOT NULL, end_time BIGINT, status VARCHAR(255), auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL, http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql index 5df4de6..37744f8 100644 --- 
a/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-Oracle-CREATE.sql @@ -229,6 +229,7 @@ CREATE TABLE host_role_command ( role_command VARCHAR2(255) NULL, stage_id NUMBER(19) NOT NULL, start_time NUMBER(19) NOT NULL, + original_start_time NUMBER(19) NOT NULL, end_time NUMBER(19), status VARCHAR2(255) NULL, auto_skip_on_failure NUMBER(1) DEFAULT 0 NOT NULL, http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql index 9a213b5..eba1745 100644 --- a/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-Postgres-CREATE.sql @@ -239,6 +239,7 @@ CREATE TABLE host_role_command ( role VARCHAR(255), stage_id BIGINT NOT NULL, start_time BIGINT NOT NULL, + original_start_time BIGINT NOT NULL, end_time BIGINT, status VARCHAR(255), auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL, http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql index 53d4bf5..cad0a39 100644 --- a/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-Postgres-EMBEDDED-CREATE.sql @@ -270,6 +270,7 @@ CREATE TABLE ambari.host_role_command ( role VARCHAR(255), stage_id BIGINT NOT NULL, start_time BIGINT NOT NULL, + original_start_time BIGINT NOT NULL, end_time BIGINT, status VARCHAR(255), auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL, 
http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql index c4e5031..346af50 100644 --- a/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-SQLAnywhere-CREATE.sql @@ -228,6 +228,7 @@ CREATE TABLE host_role_command ( role_command VARCHAR(255), stage_id NUMERIC(19) NOT NULL, start_time NUMERIC(19) NOT NULL, + original_start_time NUMERIC(19) NOT NULL, end_time NUMERIC(19), status VARCHAR(255), auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL, http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql b/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql index a50d21d..e238a76 100644 --- a/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql +++ b/ambari-server/src/main/resources/Ambari-DDL-SQLServer-CREATE.sql @@ -250,6 +250,7 @@ CREATE TABLE host_role_command ( role VARCHAR(255), stage_id BIGINT NOT NULL, start_time BIGINT NOT NULL, + original_start_time BIGINT NOT NULL, end_time BIGINT, status VARCHAR(255), auto_skip_on_failure SMALLINT DEFAULT 0 NOT NULL, http://git-wip-us.apache.org/repos/asf/ambari/blob/79b9e570/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java 
b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java new file mode 100644 index 0000000..2fb57d7 --- /dev/null +++ b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java @@ -0,0 +1,284 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.ambari.server.state.services; + +import com.google.inject.Guice; +import com.google.inject.Injector; +import org.apache.ambari.server.AmbariException; +import org.apache.ambari.server.Role; +import org.apache.ambari.server.RoleCommand; +import org.apache.ambari.server.actionmanager.HostRoleStatus; +import org.apache.ambari.server.orm.GuiceJpaInitializer; +import org.apache.ambari.server.orm.InMemoryDefaultTestModule; +import org.apache.ambari.server.orm.OrmTestHelper; +import org.apache.ambari.server.orm.dao.HostRoleCommandDAO; +import org.apache.ambari.server.orm.dao.RepositoryVersionDAO; +import org.apache.ambari.server.orm.dao.RequestDAO; +import org.apache.ambari.server.orm.dao.StackDAO; +import org.apache.ambari.server.orm.dao.StageDAO; +import org.apache.ambari.server.orm.dao.UpgradeDAO; +import org.apache.ambari.server.orm.entities.HostRoleCommandEntity; +import org.apache.ambari.server.orm.entities.RepositoryVersionEntity; +import org.apache.ambari.server.orm.entities.RequestEntity; +import org.apache.ambari.server.orm.entities.StackEntity; +import org.apache.ambari.server.orm.entities.StageEntity; +import org.apache.ambari.server.orm.entities.UpgradeEntity; +import org.apache.ambari.server.state.Cluster; +import org.apache.ambari.server.state.Clusters; +import org.apache.ambari.server.state.RepositoryVersionState; +import org.apache.ambari.server.state.StackId; +import org.apache.ambari.server.state.stack.upgrade.Direction; +import org.apache.ambari.server.state.stack.upgrade.UpgradeType; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; + +/** + * Tests {@link org.apache.ambari.server.state.services.RetryUpgradeActionService}. 
+ */ +public class RetryUpgradeActionServiceTest { + + private Injector injector; + + private StackDAO stackDAO; + private Clusters clusters; + private RepositoryVersionDAO repoVersionDAO; + private UpgradeDAO upgradeDAO; + private RequestDAO requestDAO; + private StageDAO stageDAO; + private HostRoleCommandDAO hostRoleCommandDAO; + private OrmTestHelper helper; + + // Instance variables shared by all tests + String clusterName = "c1"; + Cluster cluster; + StackEntity stackEntity220; + StackId stack220; + Long upgradeRequestId = 1L; + Long stageId = 1L; + + @Before + public void before() throws NoSuchFieldException, IllegalAccessException { + injector = Guice.createInjector(new InMemoryDefaultTestModule()); + injector.getInstance(GuiceJpaInitializer.class); + + stackDAO = injector.getInstance(StackDAO.class); + clusters = injector.getInstance(Clusters.class); + repoVersionDAO = injector.getInstance(RepositoryVersionDAO.class); + upgradeDAO = injector.getInstance(UpgradeDAO.class); + requestDAO = injector.getInstance(RequestDAO.class); + stageDAO = injector.getInstance(StageDAO.class); + hostRoleCommandDAO = injector.getInstance(HostRoleCommandDAO.class); + helper = injector.getInstance(OrmTestHelper.class); + } + + /** + * Tests that the Guava service allows retrying certain failed actions during a stack upgrade.
+ * Case 1: No cluster => no-op + * Case 2: Cluster and valid timeout, but no active upgrade => no-op + * Case 3: Cluster with an active upgrade, but no HOLDING_FAILED|HOLDING_TIMEDOUT commands => no-op + * Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that + * does NOT meet conditions to be retried => no-op + * Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that + * DOES meet conditions to be retried => retries the task + * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that + * was already retried and has now expired => no-op + * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task + * during Finalize Cluster, which should not be retried => no-op + * @throws Exception + */ + @Test + public void test() throws Exception { + int timeoutMins = 1; + RetryUpgradeActionService service = injector.getInstance(RetryUpgradeActionService.class); + service.startUp(); + + // Case 1: No cluster + service.runOneIteration(); + + // Case 2: Cluster and valid timeout, but no active upgrade + createCluster(); + service.setMaxTimeout(timeoutMins); + service.runOneIteration(); + + // Case 3: Cluster with an active upgrade, but no HOLDING_FAILED|HOLDING_TIMEDOUT commands. + prepareUpgrade(); + + // Run the service + service.runOneIteration(); + + // Assert that commands exist and that none of them was moved to PENDING (nothing should have been retried) + List commands = hostRoleCommandDAO.findAll(); + Assert.assertTrue(!commands.isEmpty()); + for (HostRoleCommandEntity hrc : commands) { + if (hrc.getStatus() == HostRoleStatus.PENDING) { + Assert.fail("Did not expect any HostRoleCommands to be PENDING"); + } + } + + // Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that does NOT meet conditions to be retried.
+ List stages = stageDAO.findByStageIds(upgradeRequestId, new HashSet(){{ add(stageId); }}); + Assert.assertTrue(!stages.isEmpty() && stages.size() == 1); + StageEntity stageEntity = stages.get(0); + + HostRoleCommandEntity hrc2 = new HostRoleCommandEntity(); + hrc2.setStage(stageEntity); + hrc2.setStatus(HostRoleStatus.HOLDING_FAILED); + hrc2.setRole(Role.ZOOKEEPER_SERVER); + hrc2.setRoleCommand(RoleCommand.RESTART); + hrc2.setRetryAllowed(false); + hrc2.setAutoSkipOnFailure(false); + stageEntity.getHostRoleCommands().add(hrc2); + + hostRoleCommandDAO.create(hrc2); + stageDAO.merge(stageEntity); + + // Run the service + service.runOneIteration(); + + commands = hostRoleCommandDAO.findAll(); + Assert.assertTrue(!commands.isEmpty() && commands.size() == 2); + for (HostRoleCommandEntity hrc : commands) { + if (hrc.getStatus() == HostRoleStatus.PENDING) { + Assert.fail("Did not expect any HostRoleCommands to be PENDING"); + } + } + + // Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that DOES meet conditions to be retried. + long now = System.currentTimeMillis(); + hrc2.setRetryAllowed(true); + hrc2.setOriginalStartTime(now); + hostRoleCommandDAO.merge(hrc2); + + // Run the service + service.runOneIteration(); + + // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING + Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus()); + + // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired. 
+ now = System.currentTimeMillis(); + hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1); + hrc2.setStatus(HostRoleStatus.HOLDING_FAILED); + hostRoleCommandDAO.merge(hrc2); + + // Run the service + service.runOneIteration(); + + Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus()); + + // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task + // during Finalize Cluster, which should not be retried. + now = System.currentTimeMillis(); + hrc2.setOriginalStartTime(now); + hrc2.setStatus(HostRoleStatus.HOLDING_FAILED); + hrc2.setCustomCommandName("org.apache.ambari.server.serveraction.upgrades.FinalizeUpgradeAction"); + hostRoleCommandDAO.merge(hrc2); + + // Run the service + service.runOneIteration(); + + Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus()); + } + + /** + * Create a cluster for stack HDP 2.2.0 + * @throws AmbariException + */ + private void createCluster() throws AmbariException { + stackEntity220 = stackDAO.find("HDP", "2.2.0"); + stack220 = new StackId("HDP-2.2.0"); + + clusters.addCluster(clusterName, stack220); + cluster = clusters.getCluster("c1"); + + RepositoryVersionEntity repoVersionEntity = new RepositoryVersionEntity(); + repoVersionEntity.setDisplayName("Initial Version"); + repoVersionEntity.setOperatingSystems(""); + repoVersionEntity.setStack(stackEntity220); + repoVersionEntity.setVersion("2.2.0.0"); + repoVersionDAO.create(repoVersionEntity); + + helper.getOrCreateRepositoryVersion(stack220, stack220.getStackVersion()); + + cluster.createClusterVersion(stack220, stack220.getStackVersion(), "admin", RepositoryVersionState.INSTALLING); + cluster.transitionClusterVersion(stack220, stack220.getStackVersion(), RepositoryVersionState.CURRENT); + } + + /** + * Create a new repo version, plus the request and stage objects needed for a ROLLING stack upgrade. 
+ * @throws AmbariException + */ + private void prepareUpgrade() throws AmbariException { + RepositoryVersionEntity repoVersionEntity = new RepositoryVersionEntity(); + repoVersionEntity.setDisplayName("Version to Upgrade To"); + repoVersionEntity.setOperatingSystems(""); + repoVersionEntity.setStack(stackEntity220); + repoVersionEntity.setVersion("2.2.0.1"); + repoVersionDAO.create(repoVersionEntity); + + helper.getOrCreateRepositoryVersion(stack220, stack220.getStackVersion()); + + RequestEntity requestEntity = new RequestEntity(); + requestEntity.setRequestId(upgradeRequestId); + requestEntity.setClusterId(cluster.getClusterId()); + requestEntity.setStatus(HostRoleStatus.PENDING); + requestDAO.create(requestEntity); + + // Create the stage and add it to the request + StageEntity stageEntity = new StageEntity(); + stageEntity.setRequest(requestEntity); + stageEntity.setClusterId(cluster.getClusterId()); + stageEntity.setRequestId(upgradeRequestId); + stageEntity.setStageId(stageId); + requestEntity.setStages(Collections.singletonList(stageEntity)); + stageDAO.create(stageEntity); + requestDAO.merge(requestEntity); + + UpgradeEntity upgrade = new UpgradeEntity(); + upgrade.setId(1L); + upgrade.setRequestId(upgradeRequestId); + upgrade.setClusterId(cluster.getClusterId()); + upgrade.setUpgradePackage("some-name"); + upgrade.setUpgradeType(UpgradeType.ROLLING); + upgrade.setDirection(Direction.UPGRADE); + upgrade.setFromVersion("2.2.0.0"); + upgrade.setToVersion("2.2.0.1"); + upgradeDAO.create(upgrade); + + cluster.setUpgradeEntity(upgrade); + + // Create the task and add it to the stage + HostRoleCommandEntity hrc1 = new HostRoleCommandEntity(); + + hrc1.setStage(stageEntity); + hrc1.setStatus(HostRoleStatus.COMPLETED); + hrc1.setRole(Role.ZOOKEEPER_SERVER); + hrc1.setRoleCommand(RoleCommand.RESTART); + + stageEntity.setHostRoleCommands(new ArrayList()); + stageEntity.getHostRoleCommands().add(hrc1); + hostRoleCommandDAO.create(hrc1); + stageDAO.merge(stageEntity); 
+ } +}