hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rkan...@apache.org
Subject hadoop git commit: YARN-8929. DefaultOOMHandler should only pick running containers to kill upon oom events (haibochen via rkanter)
Date Wed, 24 Oct 2018 20:16:03 GMT
Repository: hadoop
Updated Branches:
  refs/heads/trunk ebf8e1731 -> 69b328943


YARN-8929. DefaultOOMHandler should only pick running containers to kill upon oom events (haibochen via rkanter)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/69b32894
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/69b32894
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/69b32894

Branch: refs/heads/trunk
Commit: 69b328943edf2f61c8fc139934420e3f10bf3813
Parents: ebf8e17
Author: Robert Kanter <rkanter@apache.org>
Authored: Wed Oct 24 13:15:50 2018 -0700
Committer: Robert Kanter <rkanter@apache.org>
Committed: Wed Oct 24 13:15:50 2018 -0700

----------------------------------------------------------------------
 .../linux/resources/CGroupsHandler.java         |   2 +-
 .../linux/resources/CGroupsHandlerImpl.java     |   4 +-
 .../linux/resources/DefaultOOMHandler.java      |  45 +-
 .../linux/resources/TestCGroupsHandlerImpl.java |   2 +-
 .../linux/resources/TestDefaultOOMHandler.java  | 434 +++++++++++++++----
 5 files changed, 389 insertions(+), 98 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/69b32894/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java
index 9dc16c3..dcb0589 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java
@@ -71,7 +71,7 @@ public interface CGroupsHandler {
     }
   }
 
-  String CGROUP_FILE_TASKS = "tasks";
+  String CGROUP_PROCS_FILE = "cgroup.procs";
   String CGROUP_PARAM_CLASSID = "classid";
   String CGROUP_PARAM_BLKIO_WEIGHT = "weight";
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/69b32894/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java
index a547e8f..050d0a8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java
@@ -347,7 +347,7 @@ class CGroupsHandlerImpl implements CGroupsHandler {
   public String getPathForCGroupTasks(CGroupController controller,
       String cGroupId) {
     return getPathForCGroup(controller, cGroupId)
-        + Path.SEPARATOR + CGROUP_FILE_TASKS;
+        + Path.SEPARATOR + CGROUP_PROCS_FILE;
   }
 
   @Override
@@ -603,7 +603,7 @@ class CGroupsHandlerImpl implements CGroupsHandler {
   public String getCGroupParam(CGroupController controller, String cGroupId,
       String param) throws ResourceHandlerException {
     String cGroupParamPath =
-        param.equals(CGROUP_FILE_TASKS) ?
+        param.equals(CGROUP_PROCS_FILE) ?
             getPathForCGroup(controller, cGroupId)
                 + Path.SEPARATOR + param :
         getPathForCGroupParam(controller, cGroupId, param);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/69b32894/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
index 86137b5..844bb6c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
@@ -34,7 +34,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 
-import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS;
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES;
@@ -116,8 +116,10 @@ public class DefaultOOMHandler implements Runnable {
    * Currently the killing only succeeds for PGIDS.
    *
    * @param container Container to clean up
+   * @return true if the container is killed successfully, false otherwise
    */
-  private void sigKill(Container container) {
+  private boolean sigKill(Container container) {
+    boolean containerKilled = false;
     boolean finished = false;
     try {
       while (!finished) {
@@ -125,7 +127,7 @@ public class DefaultOOMHandler implements Runnable {
             cgroups.getCGroupParam(
                 CGroupsHandler.CGroupController.MEMORY,
                 container.getContainerId().toString(),
-                CGROUP_FILE_TASKS)
+                CGROUP_PROCS_FILE)
                 .split("\n");
         finished = true;
         for (String pid : pids) {
@@ -154,11 +156,17 @@ public class DefaultOOMHandler implements Runnable {
           LOG.debug("Interrupted while waiting for processes to disappear");
         }
       }
+      containerKilled = true;
     } catch (ResourceHandlerException ex) {
+      // the cgroup.procs file of the container may not be available because the
+      // container may not have been launched at this point when the root
+      // cgroup is under oom
       LOG.warn(String.format(
           "Cannot list more tasks in container %s to kill.",
           container.getContainerId()));
     }
+
+    return containerKilled;
   }
 
   /**
@@ -216,19 +224,34 @@ public class DefaultOOMHandler implements Runnable {
 
     ArrayList<ContainerCandidate> candidates = new ArrayList<>(0);
     for (Container container : context.getContainers().values()) {
+      if (!container.isRunning()) {
+        // skip containers that are not running yet because killing them
+        // won't release any memory to get us out of OOM.
+        continue;
+        // note even if it is indicated that the container is running from
+        // container.isRunning(), the container process might not have been
+        // running yet. From NM's perspective, a container is running as
+        // soon as the container launch is handed over to the container executor
+      }
       candidates.add(
           new ContainerCandidate(container, isContainerOutOfLimit(container)));
     }
     Collections.sort(candidates);
+    if (candidates.isEmpty()) {
+      LOG.warn(
+          "Found no running containers to kill in order to release memory");
+    }
 
-    if (candidates.size() > 0) {
-      ContainerCandidate candidate = candidates.get(0);
-      sigKill(candidate.container);
-      String message = String.format(
-          "container %s killed by elastic cgroups OOM handler.",
-          candidate.container.getContainerId());
-      LOG.warn(message);
-      containerKilled = true;
+    // make sure one container is killed successfully to release memory
+    for(int i = 0; !containerKilled && i < candidates.size(); i++) {
+      ContainerCandidate candidate = candidates.get(i);
+      if (sigKill(candidate.container)) {
+        String message = String.format(
+            "container %s killed by elastic cgroups OOM handler.",
+            candidate.container.getContainerId());
+        LOG.warn(message);
+        containerKilled = true;
+      }
     }
     return containerKilled;
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/69b32894/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java
index 0d7c097..ea6fb52 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java
@@ -266,7 +266,7 @@ public class TestCGroupsHandlerImpl {
     Assert.assertEquals(expectedPath, path);
 
     String expectedPathTasks = expectedPath + Path.SEPARATOR
-        + CGroupsHandler.CGROUP_FILE_TASKS;
+        + CGroupsHandler.CGROUP_PROCS_FILE;
     path = cGroupsHandler.getPathForCGroupTasks(controller, testCGroup);
     Assert.assertEquals(expectedPathTasks, path);
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/69b32894/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
index e239067..8a6ca74 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
@@ -33,7 +33,7 @@ import org.junit.Test;
 import java.io.IOException;
 import java.util.concurrent.ConcurrentHashMap;
 
-import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS;
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PROCS_FILE;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_USAGE_BYTES;
@@ -75,16 +75,49 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two guaranteed containers, both of which are out of limit.
+   * Test an OOM situation where there are no running containers that
+   * can be killed.
+   */
+  @Test(expected = YarnRuntimeException.class)
+  public void testExceptionThrownWithNoRunningContainersToKill()
+      throws Exception {
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(1, true, 1L, false);
+    containers.put(c1.getContainerId(), c1);
+
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+
+    DefaultOOMHandler handler = new DefaultOOMHandler(context, false) {
+      @Override
+      protected CGroupsHandler getCGroupsHandler() {
+        return cGroupsHandler;
+      }
+    };
+
+    handler.run();
+  }
+
+  /**
+   * We have two running guaranteed containers, both of which are out of limit.
    * We should kill the later one.
    */
   @Test
-  public void testBothGuaranteedContainersOverLimitUponOOM() throws Exception {
+  public void testBothRunningGuaranteedContainersOverLimitUponOOM()
+      throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, true, 1L);
+    Container c1 = createContainer(1, true, 1L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, true, 2L);
+    Container c2 = createContainer(2, true, 2L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -100,7 +133,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -109,7 +142,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(11));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -139,7 +172,7 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two GUARANTEED containers, one of which is out of limit.
+   * We have two running GUARANTEED containers, one of which is out of limit.
    * We should kill the one that's out of its limit. This should
    * happen even if it was launched earlier than the other one.
    */
@@ -147,9 +180,9 @@ public class TestDefaultOOMHandler {
   public void testOneGuaranteedContainerOverLimitUponOOM() throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, true, 2L);
+    Container c1 = createContainer(1, true, 2L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, true, 1L);
+    Container c2 = createContainer(2, true, 1L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -164,7 +197,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -175,7 +208,7 @@ public class TestDefaultOOMHandler {
 
     // container c2 is out of its limit
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -204,16 +237,16 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two GUARANTEE containers, neither of which is out of limit.
+   * We have two running GUARANTEE containers, neither of which is out of limit.
    * We should kill the later launched one.
    */
   @Test
   public void testNoGuaranteedContainerOverLimitOOM() throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, true, 1L);
+    Container c1 = createContainer(1, true, 1L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, true, 2L);
+    Container c2 = createContainer(2, true, 2L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -228,7 +261,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -237,7 +270,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -266,17 +299,250 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two opportunistic containers, both of which are out of limit.
-   * We should kill the later one.
+   * We have two OPPORTUNISTIC containers, one running and the other not.
+   * We should kill the running one.
+   */
+  @Test
+  public void testKillOnlyRunningContainersUponOOM() throws Exception {
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(1, false, 1L, false);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(2, false, 2L, true);
+    containers.put(c2.getContainerId(), c2);
+
+    ContainerExecutor ex = createContainerExecutor(containers);
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
+        .thenReturn(getMB(9));
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
+        .thenReturn(getMB(9));
+
+    DefaultOOMHandler handler =
+        new DefaultOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1235")
+            .setContainer(c2)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(any());
+  }
+
+
+  /**
+   * We have two 'running' OPPORTUNISTIC containers. Killing the most-
+   * recently launched one fails because its cgroup.procs file is not
+   * available. The other OPPORTUNISTIC container should be killed in
+   * this case.
+   */
+  @Test
+  public void  testKillOpportunisticContainerWithKillFailuresUponOOM()
+      throws Exception {
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(1, false, 1L, true);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(2, false, 2L, true);
+    containers.put(c2.getContainerId(), c2);
+
+    ContainerExecutor ex = createContainerExecutor(containers);
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
+        .thenReturn(getMB(9));
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
+        .thenReturn(getMB(9));
+    // c2 process has not started, hence no cgroup.procs file yet
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenThrow(
+            new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
+
+    DefaultOOMHandler handler =
+        new DefaultOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1235")
+            .setContainer(c1)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(any());
+  }
+
+  /**
+   * We have two 'running' OPPORTUNISTIC containers and one GUARANTEED
+   * container. Killing two OPPORTUNISTIC containers fails because they
+   * have not really started running as processes since the root cgroup
+   * is under oom. We should try to kill one container successfully. In
+   * this case, the GUARANTEED container should be killed.
+   */
+  @Test
+  public void testKillGuaranteedContainerWithKillFailuresUponOOM()
+      throws Exception {
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(1, false, 1L, true);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(2, false, 2L, true);
+    containers.put(c2.getContainerId(), c2);
+    Container c3 = createContainer(3, true, 2L, true);
+    containers.put(c3.getContainerId(), c3);
+
+    ContainerExecutor ex = createContainerExecutor(containers);
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+    // c1 process has not started, hence no cgroup.procs file yet
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenThrow(
+            new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
+    // c2 process has not started, hence no cgroup.procs file yet
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenThrow(
+            new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
+        .thenReturn(getMB(9));
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
+        .thenReturn(getMB(9));
+
+    DefaultOOMHandler handler =
+        new DefaultOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1235")
+            .setContainer(c3)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(any());
+  }
+
+  /**
+   * Test an OOM situation where no containers are killed successfully.
+   *
+   * We have two 'running' containers, none of which are actually
+   * running as processes. Their cgroup.procs file is not available,
+   * so killing them won't succeed.
+   */
+  @Test(expected = YarnRuntimeException.class)
+  public void testExceptionThrownWhenNoContainersKilledSuccessfully()
+      throws Exception {
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(1, false, 1L, true);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(2, false, 2L, true);
+    containers.put(c2.getContainerId(), c2);
+
+    ContainerExecutor ex = createContainerExecutor(containers);
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+    // c1 process has not started, hence no cgroup.procs file yet
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenThrow(
+            new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
+    // c2 process has not started, hence no cgroup.procs file yet
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
+        .thenThrow(
+            new ResourceHandlerException(CGROUP_PROCS_FILE + " not found"));
+
+    DefaultOOMHandler handler =
+        new DefaultOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+  }
+
+  /**
+   * We have two running opportunistic containers, both of which are out of
+   * limit. We should kill the later one.
    */
   @Test
   public void testBothOpportunisticContainersOverLimitUponOOM()
       throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, false, 1L);
+    Container c1 = createContainer(1, false, 1L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, false, 2L);
+    Container c2 = createContainer(2, false, 2L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -292,7 +558,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -301,7 +567,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(11));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -331,17 +597,17 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers, one of which is out of limit.
-   * We should kill the one that's out of its limit. This should
+   * We have two running OPPORTUNISTIC containers, one of which is out of
+   * limit. We should kill the one that's out of its limit. This should
    * happen even if it was launched earlier than the other one.
    */
   @Test
   public void testOneOpportunisticContainerOverLimitUponOOM() throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, false, 2L);
+    Container c1 = createContainer(1, false, 2L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, false, 1L);
+    Container c2 = createContainer(2, false, 1L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -356,7 +622,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -366,7 +632,7 @@ public class TestDefaultOOMHandler {
         .thenReturn(getMB(9));
     // container c2 is out of its limit
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -395,16 +661,16 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers, neither of which is out of limit.
-   * We should kill the later one.
+   * We have two running OPPORTUNISTIC containers, neither of which is out of
+   * limit. We should kill the later one.
    */
   @Test
   public void testNoOpportunisticContainerOverLimitOOM() throws Exception {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(1, false, 1L);
+    Container c1 = createContainer(1, false, 1L, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(2, false, 2L);
+    Container c2 = createContainer(2, false, 2L, true);
     containers.put(c2.getContainerId(), c2);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -419,7 +685,7 @@ public class TestDefaultOOMHandler {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -428,7 +694,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -457,8 +723,8 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
-   * One of the OPPORTUNISTIC container is out of limit.
+   * We have two running OPPORTUNISTIC containers and one running GUARANTEED
+   * container. One of the OPPORTUNISTIC containers is out of limit.
    * OOM is resolved after killing the OPPORTUNISTIC container that
    * exceeded its limit even though it is launched earlier than the
    * other OPPORTUNISTIC container.
@@ -469,11 +735,11 @@ public class TestDefaultOOMHandler {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
     int currentContainerId = 0;
-    Container c1 = createContainer(currentContainerId++, false, 2);
+    Container c1 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 1);
+    Container c2 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 1);
+    Container c3 = createContainer(currentContainerId++, true, 1, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -489,7 +755,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -500,7 +766,7 @@ public class TestDefaultOOMHandler {
 
     // container c2 is out of its limit
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -510,7 +776,7 @@ public class TestDefaultOOMHandler {
         .thenReturn(getMB(11));
 
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -538,8 +804,8 @@ public class TestDefaultOOMHandler {
     verify(ex, times(1)).signalContainer(any());
   }
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
-   * None of the containers exceeded its memory limit.
+   * We have two running OPPORTUNISTIC containers and one running GUARANTEED
+   * container. None of the containers exceeded its memory limit.
    * OOM is resolved after killing the most recently launched OPPORTUNISTIC
    * container.
    */
@@ -548,11 +814,11 @@ public class TestDefaultOOMHandler {
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
     int currentContainerId = 0;
-    Container c1 = createContainer(currentContainerId++, false, 1);
+    Container c1 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 2);
+    Container c2 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 1);
+    Container c3 = createContainer(currentContainerId++, true, 1, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -568,7 +834,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -577,7 +843,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -586,7 +852,7 @@ public class TestDefaultOOMHandler {
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -615,8 +881,8 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
-   * One of the OPPORTUNISTIC container is out of limit.
+   * We have two running OPPORTUNISTIC containers and one running GUARANTEED
+   * container. One of the OPPORTUNISTIC containers is out of limit.
    * OOM is resolved after killing both OPPORTUNISTIC containers.
    */
   @Test
@@ -625,11 +891,11 @@ public class TestDefaultOOMHandler {
 
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(currentContainerId++, false, 2);
+    Container c1 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 1);
+    Container c2 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 1);
+    Container c3 = createContainer(currentContainerId++, true, 1, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -646,7 +912,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -655,7 +921,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -664,7 +930,7 @@ public class TestDefaultOOMHandler {
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(11));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -701,8 +967,8 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
-   * the GUARANTEED container is out of limit. OOM is resolved
+   * We have two running OPPORTUNISTIC containers and one running GUARANTEED
+   * container. The GUARANTEED container is out of limit. OOM is resolved
    * after first killing the two OPPORTUNISTIC containers and then the
    * GUARANTEED container.
    */
@@ -712,11 +978,11 @@ public class TestDefaultOOMHandler {
 
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(currentContainerId++, false, 2);
+    Container c1 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 1);
+    Container c2 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 1);
+    Container c3 = createContainer(currentContainerId++, true, 1, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -734,7 +1000,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -743,7 +1009,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -752,7 +1018,7 @@ public class TestDefaultOOMHandler {
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -795,8 +1061,8 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
-   * None of the containers exceeded its memory limit.
+   * We have two running OPPORTUNISTIC containers and one running GUARANTEED
+   * container. None of the containers exceeded its memory limit.
    * OOM is resolved after killing all running containers.
    */
   @Test
@@ -805,11 +1071,11 @@ public class TestDefaultOOMHandler {
 
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(currentContainerId++, false, 1);
+    Container c1 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 2);
+    Container c2 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 1);
+    Container c3 = createContainer(currentContainerId++, true, 1, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -827,7 +1093,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 0");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -836,7 +1102,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -845,7 +1111,7 @@ public class TestDefaultOOMHandler {
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -888,7 +1154,8 @@ public class TestDefaultOOMHandler {
   }
 
   /**
-   * We have two OPPORTUNISTIC containers and one GUARANTEED container.
+   * We have two running OPPORTUNISTIC containers and one running
+   * GUARANTEED container.
    * None of the containers exceeded its memory limit.
    * OOM is not resolved even after killing all running containers.
    * A YarnRuntimeException is expected to be thrown.
@@ -899,11 +1166,11 @@ public class TestDefaultOOMHandler {
 
     ConcurrentHashMap<ContainerId, Container> containers =
         new ConcurrentHashMap<>();
-    Container c1 = createContainer(currentContainerId++, false, 1);
+    Container c1 = createContainer(currentContainerId++, false, 1, true);
     containers.put(c1.getContainerId(), c1);
-    Container c2 = createContainer(currentContainerId++, false, 2);
+    Container c2 = createContainer(currentContainerId++, false, 2, true);
     containers.put(c2.getContainerId(), c2);
-    Container c3 = createContainer(currentContainerId++, true, 3);
+    Container c3 = createContainer(currentContainerId++, true, 3, true);
     containers.put(c3.getContainerId(), c3);
 
     ContainerExecutor ex = createContainerExecutor(containers);
@@ -921,7 +1188,7 @@ public class TestDefaultOOMHandler {
         .thenReturn("under_oom 1")
         .thenReturn("under_oom 1");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c1.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1234").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -930,7 +1197,7 @@ public class TestDefaultOOMHandler {
         c1.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c2.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1235").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -939,7 +1206,7 @@ public class TestDefaultOOMHandler {
         c2.getContainerId().toString(), CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES))
         .thenReturn(getMB(9));
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
-        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        c3.getContainerId().toString(), CGROUP_PROCS_FILE))
         .thenReturn("1236").thenReturn("");
     when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
         c3.getContainerId().toString(), CGROUP_PARAM_MEMORY_USAGE_BYTES))
@@ -974,7 +1241,7 @@ public class TestDefaultOOMHandler {
   }
 
   private static Container createContainer(int containerId,
-      boolean guaranteed, long launchTime) {
+      boolean guaranteed, long launchTime, boolean running) {
     Container c1 = mock(Container.class);
     ContainerId cid1 = createContainerId(containerId);
     when(c1.getContainerId()).thenReturn(cid1);
@@ -987,6 +1254,7 @@ public class TestDefaultOOMHandler {
 
     when(c1.getResource()).thenReturn(Resource.newInstance(10, 1));
     when(c1.getContainerLaunchTime()).thenReturn(launchTime);
+    when(c1.isRunning()).thenReturn(running);
 
     return c1;
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message