Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2514618B5B for ; Thu, 12 Nov 2015 19:20:51 +0000 (UTC) Received: (qmail 23729 invoked by uid 500); 12 Nov 2015 19:20:50 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 23670 invoked by uid 500); 12 Nov 2015 19:20:50 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 23661 invoked by uid 99); 12 Nov 2015 19:20:50 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 12 Nov 2015 19:20:50 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 7A99CE5E25; Thu, 12 Nov 2015 19:20:50 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wangda@apache.org To: common-commits@hadoop.apache.org Message-Id: <510896aa025344e295e3765f5ee703ca@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda) Date: Thu, 12 Nov 2015 19:20:50 +0000 (UTC) Repository: hadoop Updated Branches: refs/heads/branch-2.7 adede3e53 -> ddce4c824 YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/ddce4c82 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/ddce4c82 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/ddce4c82 Branch: refs/heads/branch-2.7 Commit: ddce4c82458d603667e54136d59f0dce10140018 Parents: adede3e Author: Wangda Tan Authored: Thu Nov 12 11:20:38 2015 -0800 Committer: Wangda Tan Committed: Thu Nov 12 11:20:38 2015 -0800 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 2 + .../server/resourcemanager/rmapp/RMAppImpl.java | 2 +- .../rmapp/attempt/RMAppAttemptImpl.java | 26 ++++++++++-- .../TestWorkPreservingRMRestart.java | 42 ++++++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/ddce4c82/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 96ac611..f961b0c 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -18,6 +18,8 @@ Release 2.7.3 - UNRELEASED YARN-3840. Resource Manager web ui issue when sorting application by id (with application having id > 9999) (Mohammad Shahid Khan via jianhe) + YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/ddce4c82/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index a6fc58b..23abf4a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -1282,7 +1282,7 @@ public class RMAppImpl implements RMApp, Recoverable { || appState == RMAppState.KILLED; } - private RMAppState getRecoveredFinalState() { + public RMAppState getRecoveredFinalState() { return this.recoveredFinalState; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/ddce4c82/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 9b8bd88..e07654d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -81,6 +81,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent; @@ -1039,6 +1040,9 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { @Override public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { + RMApp rmApp = appAttempt.rmContext.getRMApps().get( + appAttempt.getAppAttemptId().getApplicationId()); + /* * If last attempt recovered final state is null .. it means attempt was * started but AM container may or may not have started / finished. @@ -1046,8 +1050,6 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { */ if (appAttempt.recoveredFinalState != null) { appAttempt.progress = 1.0f; - RMApp rmApp =appAttempt.rmContext.getRMApps().get( - appAttempt.getAppAttemptId().getApplicationId()); // We will replay the final attempt only if last attempt is in final // state but application is not in final state. if (rmApp.getCurrentAppAttempt() == appAttempt @@ -1060,7 +1062,24 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { appAttempt, event); } return appAttempt.recoveredFinalState; - } else { + } else if (RMAppImpl.isAppInFinalState(rmApp)) { + // Somehow attempt final state was not saved but app final state was saved. + // Skip adding the attempt into scheduler + RMAppState appState = ((RMAppImpl) rmApp).getRecoveredFinalState(); + LOG.warn(rmApp.getApplicationId() + " final state (" + appState + + ") was recorded, but " + appAttempt.applicationAttemptId + + " final state (" + appAttempt.recoveredFinalState + + ") was not recorded."); + switch (appState) { + case FINISHED: + return RMAppAttemptState.FINISHED; + case FAILED: + return RMAppAttemptState.FAILED; + case KILLED: + return RMAppAttemptState.KILLED; + } + return RMAppAttemptState.FAILED; + } else{ // Add the current attempt to the scheduler. if (appAttempt.rmContext.isWorkPreservingRecoveryEnabled()) { // Need to register an app attempt before AM can register @@ -1094,6 +1113,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { } } + private void rememberTargetTransitions(RMAppAttemptEvent event, Object transitionToDo, RMAppAttemptState targetFinalState) { transitionTodo = transitionToDo; http://git-wip-us.apache.org/repos/asf/hadoop/blob/ddce4c82/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java index 8283844..3103839 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java @@ -60,6 +60,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM; import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; +import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; @@ -1160,4 +1161,45 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase nm1.setResourceTrackerService(rm2.getResourceTrackerService()); rm2.start(); } + + // Test that if application state was saved, but attempt state was not saved. + // RM should start correctly. + @Test (timeout = 20000) + public void testAppStateSavedButAttemptStateNotSaved() throws Exception { + MemoryRMStateStore memStore = new MemoryRMStateStore() { + @Override public synchronized void updateApplicationAttemptStateInternal( + ApplicationAttemptId appAttemptId, + ApplicationAttemptStateData attemptState) { + // do nothing; + // simulate the failure that attempt final state is not saved. + } + }; + memStore.init(conf); + rm1 = new MockRM(conf, memStore); + rm1.start(); + + MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService()); + nm1.registerNode(); + + RMApp app1 = rm1.submitApp(200); + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); + MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1); + + ApplicationStateData appSavedState = + memStore.getState().getApplicationState().get(app1.getApplicationId()); + + // check that app state is saved. + assertEquals(RMAppState.FINISHED, appSavedState.getState()); + // check that attempt state is not saved. + assertNull(appSavedState.getAttempt(am1.getApplicationAttemptId()).getState()); + + rm2 = new MockRM(conf, memStore); + rm2.start(); + RMApp recoveredApp1 = + rm2.getRMContext().getRMApps().get(app1.getApplicationId()); + + assertEquals(RMAppState.FINISHED, recoveredApp1.getState()); + // check that attempt state is recovered correctly. + assertEquals(RMAppAttemptState.FINISHED, recoveredApp1.getCurrentAppAttempt().getState()); + } }