hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vino...@apache.org
Subject hadoop git commit: YARN-2816. NM fail to start with NPE during container recovery. Contributed by Zhihai Xu (cherry picked from commit 49c38898b0be64fc686d039ed2fb2dea1378df02)
Date Fri, 28 Aug 2015 01:35:27 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.6.1 81ba30211 -> f83d89894


YARN-2816. NM fail to start with NPE during container recovery. Contributed by Zhihai Xu
(cherry picked from commit 49c38898b0be64fc686d039ed2fb2dea1378df02)

(cherry picked from commit ad140d1fc831735fb9335e27b38d2fc040847af1)
(cherry picked from commit 85b23c323c80c5303bd0b7bdb066258792ca67d8)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/f83d8989
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/f83d8989
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/f83d8989

Branch: refs/heads/branch-2.6.1
Commit: f83d89894425b601ccb65d72bfce3dab12a9d898
Parents: 81ba302
Author: Jason Lowe <jlowe@apache.org>
Authored: Fri Nov 14 21:25:59 2014 +0000
Committer: Vinod Kumar Vavilapalli <vinodkv@apache.org>
Committed: Thu Aug 27 18:32:59 2015 -0700

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 +++
 .../recovery/NMLeveldbStateStoreService.java    | 24 +++++++++++++++++++-
 .../TestNMLeveldbStateStoreService.java         |  7 ++++++
 3 files changed, 33 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 6691f6e..5e8e4f9 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -15,6 +15,9 @@ Release 2.6.1 - UNRELEASED
     YARN-2856. Fixed RMAppImpl to handle ATTEMPT_KILLED event at ACCEPTED state
     on app recovery. (Rohith Sharmaks via jianhe)
 
+    YARN-2816. NM fail to start with NPE during container recovery (Zhihai Xu
+    via jlowe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 7cf4921..9d54688 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -146,6 +146,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
       throws IOException {
     ArrayList<RecoveredContainerState> containers =
         new ArrayList<RecoveredContainerState>();
+    ArrayList<ContainerId> containersToRemove =
+              new ArrayList<ContainerId>();
     LeveldbIterator iter = null;
     try {
       iter = new LeveldbIterator(db);
@@ -165,7 +167,14 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
         ContainerId containerId = ConverterUtils.toContainerId(
             key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos));
         String keyPrefix = key.substring(0, idEndPos+1);
-        containers.add(loadContainerState(containerId, iter, keyPrefix));
+        RecoveredContainerState rcs = loadContainerState(containerId,
+            iter, keyPrefix);
+        // Don't load container without StartContainerRequest
+        if (rcs.startRequest != null) {
+          containers.add(rcs);
+        } else {
+          containersToRemove.add(containerId);
+        }
       }
     } catch (DBException e) {
       throw new IOException(e);
@@ -175,6 +184,19 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
       }
     }
 
+    // remove container without StartContainerRequest
+    for (ContainerId containerId : containersToRemove) {
+      LOG.warn("Remove container " + containerId +
+          " with incomplete records");
+      try {
+        removeContainer(containerId);
+        // TODO: kill and cleanup the leaked container
+      } catch (IOException e) {
+        LOG.error("Unable to remove container " + containerId +
+            " in store", e);
+      }
+    }
+
     return containers;
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/f83d8989/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index 438cec3..f7f43cc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -274,6 +274,13 @@ public class TestNMLeveldbStateStoreService {
     assertEquals(containerReq, rcs.getStartRequest());
     assertTrue(rcs.getDiagnostics().isEmpty());
 
+    // store a new container record without StartContainerRequest
+    ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6);
+    stateStore.storeContainerLaunched(containerId1);
+    recoveredContainers = stateStore.loadContainersState();
+    // check whether the new container record is discarded
+    assertEquals(1, recoveredContainers.size());
+
     // launch the container, add some diagnostics, and verify recovered
     StringBuilder diags = new StringBuilder();
     stateStore.storeContainerLaunched(containerId);


Mime
View raw message