Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id D406D18D50 for ; Tue, 8 Sep 2015 00:47:42 +0000 (UTC) Received: (qmail 40138 invoked by uid 500); 8 Sep 2015 00:47:42 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 40069 invoked by uid 500); 8 Sep 2015 00:47:42 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 40060 invoked by uid 99); 8 Sep 2015 00:47:42 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Sep 2015 00:47:42 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 767DAE00D8; Tue, 8 Sep 2015 00:47:42 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: xgong@apache.org To: common-commits@hadoop.apache.org Message-Id: <73adb76f0a654d5db7c846ba9c3d95b9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when state-store error occurs. Contributed by Jian He Date: Tue, 8 Sep 2015 00:47:42 +0000 (UTC) Repository: hadoop Updated Branches: refs/heads/branch-2.7 96b9455c6 -> b55fb0ac4 YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when state-store error occurs. 
Contributed by Jian He (cherry picked from commit 9b78e6e33d8c117c1e909df414f20d9db56efe4b) (cherry picked from commit a0b7ef15d0663076b65ae3f53271b54e42308bfb) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/b55fb0ac Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/b55fb0ac Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/b55fb0ac Branch: refs/heads/branch-2.7 Commit: b55fb0ac44c87c809c32de54654724b1374b66ae Parents: 96b9455 Author: Xuan Authored: Mon Sep 7 17:45:47 2015 -0700 Committer: Xuan Committed: Mon Sep 7 17:47:32 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 +++ .../apache/hadoop/yarn/conf/YarnConfiguration.java | 2 +- .../src/main/resources/yarn-default.xml | 5 ++++- .../resourcemanager/recovery/RMStateStore.java | 15 +++++++++------ 4 files changed, 17 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/b55fb0ac/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index cc5a02f..5463d37 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -76,6 +76,9 @@ Release 2.7.2 - UNRELEASED YARN-4105. Capacity Scheduler headroom for DRF is wrong (Chang Li via jlowe) + YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when + state-store error occurs. 
(Jian He via xgong) + Release 2.7.1 - 2015-07-06 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/b55fb0ac/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index f98db44..7b71be6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -402,7 +402,7 @@ public class YarnConfiguration extends Configuration { public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; - public static final boolean DEFAULT_YARN_FAIL_FAST = true; + public static final boolean DEFAULT_YARN_FAIL_FAST = false; public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; http://git-wip-us.apache.org/repos/asf/hadoop/blob/b55fb0ac/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 4827b9a..cb3ad97 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -298,9 +298,12 @@ <property> <description> Should YARN fail fast if it encounters any errors. + This is a global config for all other components including RM,NM etc. 
+ If no value is set for component-specific config (e.g yarn.resourcemanager.fail-fast), + this value will be the default. </description> <name>yarn.fail-fast</name> - <value>true</value> + <value>false</value> </property> http://git-wip-us.apache.org/repos/asf/hadoop/blob/b55fb0ac/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 95977ea..071b5c6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; +import org.apache.hadoop.yarn.conf.HAUtil; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.Dispatcher; @@ -857,18 +858,20 @@ public abstract class RMStateStore extends AbstractService { */ protected void notifyStoreOperationFailed(Exception failureCause) { LOG.error("State store operation failed ", failureCause); - if (failureCause instanceof StoreFencedException) { + if 
(HAUtil.isHAEnabled(getConfig())) { + LOG.warn("State-store fenced ! Transitioning RM to standby"); updateFencedState(); Thread standByTransitionThread = new Thread(new StandByTransitionThread()); standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.start(); + } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { + LOG.fatal("Fail RM now due to state-store error!"); + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, + failureCause)); } else { - if (YarnConfiguration.shouldRMFailFast(getConfig())) { - rmDispatcher.getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, - failureCause)); - } + LOG.warn("Skip the state-store error."); } }