hadoop-common-commits mailing list archives

From x...@apache.org
Subject [03/45] hadoop git commit: YARN-8461. Support strict memory control on individual container with elastic control memory mechanism. Contributed by Haibo Chen.
Date Mon, 02 Jul 2018 20:32:20 GMT
YARN-8461. Support strict memory control on individual container with elastic control memory
mechanism. Contributed by Haibo Chen.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/62d83ca5
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/62d83ca5
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/62d83ca5

Branch: refs/heads/HDDS-4
Commit: 62d83ca5360cf803ecf6780caf192462d0092009
Parents: b69ba0f
Author: Miklos Szegedi <miklos.szegedi@cloudera.com>
Authored: Tue Jun 26 15:21:35 2018 -0700
Committer: Miklos Szegedi <miklos.szegedi@cloudera.com>
Committed: Tue Jun 26 15:21:35 2018 -0700

----------------------------------------------------------------------
 .../CGroupsMemoryResourceHandlerImpl.java       |  24 +++++
 .../linux/resources/MemoryResourceHandler.java  |  10 ++
 .../monitor/ContainersMonitorImpl.java          | 108 +++++++++++--------
 .../TestCGroupsMemoryResourceHandlerImpl.java   |  43 ++++++++
 4 files changed, 142 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
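
For context, the two enforcement modes this patch branches on
(strictMemoryEnforcement and elasticMemoryEnforcement) are driven by
NodeManager configuration. A minimal sketch of enabling both, assuming the
yarn.nodemanager.resource.memory.enforced and
yarn.nodemanager.elastic-memory-control.enabled property names from
YarnConfiguration (the names are my reading of the surrounding code, not
quoted from this patch):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;

    public class EnableBothModes {
      public static Configuration nmConf() {
        Configuration conf = new YarnConfiguration();
        // Strict: cgroups sets a hard per-container memory limit.
        conf.setBoolean("yarn.nodemanager.resource.memory.enforced", true);
        // Elastic: the node-level controller freezes an offending container
        // on OOM instead of letting the kernel kill it outright.
        conf.setBoolean("yarn.nodemanager.elastic-memory-control.enabled",
            true);
        return conf;
      }
    }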


http://git-wip-us.apache.org/repos/asf/hadoop/blob/62d83ca5/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java
index a57adb1..053b796 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java
@@ -34,6 +34,9 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
 import java.io.File;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Optional;
+
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
 
 /**
  * Handler class to handle the memory controller. YARN already ships a
@@ -172,4 +175,25 @@ public class CGroupsMemoryResourceHandlerImpl implements MemoryResourceHandler {
     return null;
   }
 
+  @Override
+  public Optional<Boolean> isUnderOOM(ContainerId containerId) {
+    try {
+      String status = cGroupsHandler.getCGroupParam(
+          CGroupsHandler.CGroupController.MEMORY,
+          containerId.toString(),
+          CGROUP_PARAM_MEMORY_OOM_CONTROL);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("cgroups OOM status for " + containerId + ": " + status);
+      }
+      if (status.contains(CGroupsHandler.UNDER_OOM)) {
+        LOG.warn("Container " + containerId + " under OOM based on cgroups.");
+        return Optional.of(true);
+      } else {
+        return Optional.of(false);
+      }
+    } catch (ResourceHandlerException e) {
+      LOG.warn("Could not read cgroups" + containerId, e);
+    }
+    return Optional.empty();
+  }
 }
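
The probe above asks the cgroups memory controller for the container's
memory.oom_control contents. A minimal standalone sketch of the same check,
assuming cgroup v1 semantics where the file holds lines such as
"oom_kill_disable 1" and "under_oom 1" (the path and class below are
illustrative, not part of the patch):

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Optional;

    public class OomControlProbe {
      // Reads <cgroupDir>/memory.oom_control and reports whether the kernel
      // currently has the cgroup paused on OOM ("under_oom 1").
      public static Optional<Boolean> isUnderOom(String cgroupDir) {
        try {
          String status = new String(Files.readAllBytes(
              Paths.get(cgroupDir, "memory.oom_control")));
          return Optional.of(status.contains("under_oom 1"));
        } catch (IOException e) {
          return Optional.empty(); // status unknown; let the caller decide
        }
      }
    }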

http://git-wip-us.apache.org/repos/asf/hadoop/blob/62d83ca5/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/MemoryResourceHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/MemoryResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/MemoryResourceHandler.java
index 013a49f..1729fc1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/MemoryResourceHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/MemoryResourceHandler.java
@@ -20,8 +20,18 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+
+import java.util.Optional;
 
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 public interface MemoryResourceHandler extends ResourceHandler {
+  /**
+   * Check whether a container is under OOM.
+   * @param containerId the id of the container
+   * @return empty if the status is unknown, true if the container is under
+   *         OOM, false otherwise
+   */
+  Optional<Boolean> isUnderOOM(ContainerId containerId);
 }
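
Callers should treat the return value as three-valued: present-and-true
(the container is frozen on OOM and should be killed), present-and-false
(healthy), and empty (the cgroup could not be read, so no enforcement
decision should be made). A hedged usage sketch (the handler field and the
kill() helper are illustrative, not from the patch):

    Optional<Boolean> underOom = memoryResourceHandler.isUnderOOM(containerId);
    if (underOom.isPresent() && underOom.get()) {
      // cgroups says the container exceeded its limit and is frozen.
      kill(containerId, ContainerExitStatus.KILLED_EXCEEDED_PMEM);
    } else if (!underOom.isPresent()) {
      // Unknown status: fall back to procfs-based accounting rather than
      // killing on incomplete information.
    }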

http://git-wip-us.apache.org/repos/asf/hadoop/blob/62d83ca5/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
index bd68dfe..d83fe39 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
@@ -22,6 +22,7 @@ import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupElasticMemoryController;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.MemoryResourceHandler;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule;
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.slf4j.Logger;
@@ -51,6 +52,7 @@ import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
 import java.util.Arrays;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 
 /**
@@ -697,55 +699,75 @@ public class ContainersMonitorImpl extends AbstractService implements
                             ProcessTreeInfo ptInfo,
                             long currentVmemUsage,
                             long currentPmemUsage) {
-      if (elasticMemoryEnforcement || strictMemoryEnforcement) {
-        // We enforce the overall memory usage instead of individual containers
-        return;
-      }
-      boolean isMemoryOverLimit = false;
-      long vmemLimit = ptInfo.getVmemLimit();
-      long pmemLimit = ptInfo.getPmemLimit();
-      // as processes begin with an age 1, we want to see if there
-      // are processes more than 1 iteration old.
-      long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
-      long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
+      Optional<Boolean> isMemoryOverLimit = Optional.empty();
       String msg = "";
       int containerExitStatus = ContainerExitStatus.INVALID;
-      if (isVmemCheckEnabled()
-              && isProcessTreeOverLimit(containerId.toString(),
-              currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
-        // The current usage (age=0) is always higher than the aged usage. We
-        // do not show the aged size in the message, base the delta on the
-        // current usage
-        long delta = currentVmemUsage - vmemLimit;
-        // Container (the root process) is still alive and overflowing
-        // memory.
-        // Dump the process-tree and then clean it up.
-        msg = formatErrorMessage("virtual",
-                formatUsageString(currentVmemUsage, vmemLimit,
+
+      if (strictMemoryEnforcement && elasticMemoryEnforcement) {
+        // Both elastic memory control and strict memory control are enabled
+        // through cgroups. A container will be frozen by the elastic memory
+        // control mechanism if it exceeds its request, so we check for this
+        // here and kill it. Otherwise, the container will not be killed if
+        // the node never exceeds its limit and the procfs-based
+        // memory accounting is different from the cgroup-based accounting.
+
+        MemoryResourceHandler handler =
+            ResourceHandlerModule.getMemoryResourceHandler();
+        if (handler != null) {
+          isMemoryOverLimit = handler.isUnderOOM(containerId);
+          containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
+          msg = containerId + " is under oom because it exceeded its" +
+              " physical memory limit";
+        }
+      } else if (strictMemoryEnforcement || elasticMemoryEnforcement) {
+        // cgroups enforces the memory limit on its own; skip these checks
+        isMemoryOverLimit = Optional.of(false);
+      }
+
+      if (!isMemoryOverLimit.isPresent()) {
+        long vmemLimit = ptInfo.getVmemLimit();
+        long pmemLimit = ptInfo.getPmemLimit();
+        // as processes begin with an age 1, we want to see if there
+        // are processes more than 1 iteration old.
+        long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
+        long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
+        if (isVmemCheckEnabled()
+            && isProcessTreeOverLimit(containerId.toString(),
+            currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
+          // The current usage (age=0) is always higher than the aged usage. We
+          // do not show the aged size in the message, base the delta on the
+          // current usage
+          long delta = currentVmemUsage - vmemLimit;
+          // Container (the root process) is still alive and overflowing
+          // memory.
+          // Dump the process-tree and then clean it up.
+          msg = formatErrorMessage("virtual",
+              formatUsageString(currentVmemUsage, vmemLimit,
                   currentPmemUsage, pmemLimit),
-                pId, containerId, pTree, delta);
-        isMemoryOverLimit = true;
-        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
-      } else if (isPmemCheckEnabled()
-              && isProcessTreeOverLimit(containerId.toString(),
-              currentPmemUsage, curRssMemUsageOfAgedProcesses,
-              pmemLimit)) {
-        // The current usage (age=0) is always higher than the aged usage. We
-        // do not show the aged size in the message, base the delta on the
-        // current usage
-        long delta = currentPmemUsage - pmemLimit;
-        // Container (the root process) is still alive and overflowing
-        // memory.
-        // Dump the process-tree and then clean it up.
-        msg = formatErrorMessage("physical",
-                formatUsageString(currentVmemUsage, vmemLimit,
+              pId, containerId, pTree, delta);
+          isMemoryOverLimit = Optional.of(true);
+          containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
+        } else if (isPmemCheckEnabled()
+            && isProcessTreeOverLimit(containerId.toString(),
+            currentPmemUsage, curRssMemUsageOfAgedProcesses,
+            pmemLimit)) {
+          // The current usage (age=0) is always higher than the aged usage. We
+          // do not show the aged size in the message, base the delta on the
+          // current usage
+          long delta = currentPmemUsage - pmemLimit;
+          // Container (the root process) is still alive and overflowing
+          // memory.
+          // Dump the process-tree and then clean it up.
+          msg = formatErrorMessage("physical",
+              formatUsageString(currentVmemUsage, vmemLimit,
                   currentPmemUsage, pmemLimit),
-                pId, containerId, pTree, delta);
-        isMemoryOverLimit = true;
-        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
+              pId, containerId, pTree, delta);
+          isMemoryOverLimit = Optional.of(true);
+          containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
+        }
       }
 
-      if (isMemoryOverLimit) {
+      if (isMemoryOverLimit.isPresent() && isMemoryOverLimit.get()) {
         // Virtual or physical memory over limit. Fail the container and
         // remove
         // the corresponding process tree
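
The rewritten check now resolves in three stages: with both strict and
elastic enforcement on, the cgroups OOM status is authoritative; with
exactly one of them on, per-container checks are skipped because cgroups
enforces memory on its own; otherwise the legacy procfs process-tree
comparison runs. A condensed sketch of that flow (names shortened for
illustration; procfsOverLimit() and killContainer() stand in for the
surrounding logic and are not patch methods):

    Optional<Boolean> overLimit = Optional.empty();
    if (strictMemoryEnforcement && elasticMemoryEnforcement) {
      // The elastic controller freezes an offending container; detect it.
      overLimit = handler.isUnderOOM(containerId);
    } else if (strictMemoryEnforcement || elasticMemoryEnforcement) {
      overLimit = Optional.of(false); // cgroups enforces; nothing to do here
    }
    if (!overLimit.isPresent()) {
      // Legacy path: procfs usage vs. vmem/pmem limits, as before.
      overLimit = Optional.of(procfsOverLimit(containerId));
    }
    if (overLimit.get()) {
      killContainer(containerId, containerExitStatus);
    }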

http://git-wip-us.apache.org/repos/asf/hadoop/blob/62d83ca5/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsMemoryResourceHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsMemoryResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsMemoryResourceHandlerImpl.java
index 5c7e233..4d3e7e6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsMemoryResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsMemoryResourceHandlerImpl.java
@@ -31,7 +31,9 @@ import org.junit.Test;
 import org.junit.Assert;
 
 import java.util.List;
+import java.util.Optional;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
 import static org.mockito.Mockito.*;
 
 /**
@@ -242,4 +244,45 @@ public class TestCGroupsMemoryResourceHandlerImpl {
         .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
             CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, "1024M");
   }
+
+  @Test
+  public void testContainerUnderOom() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
+    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
+
+    cGroupsMemoryResourceHandler.bootstrap(conf);
+
+    ContainerId containerId = mock(ContainerId.class);
+    when(containerId.toString()).thenReturn("container_01_01");
+
+    when(mockCGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        containerId.toString(),
+        CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn(CGroupsHandler.UNDER_OOM);
+    Optional<Boolean> outOfOom =
+        cGroupsMemoryResourceHandler.isUnderOOM(containerId);
+    Assert.assertTrue("The container should be reported to run under oom",
+        outOfOom.isPresent() && outOfOom.get().equals(true));
+
+    when(mockCGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        containerId.toString(),
+        CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn("");
+    outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
+    Assert.assertTrue(
+        "The container should not be reported to run under oom",
+        outOfOom.isPresent() && outOfOom.get().equals(false));
+
+    when(mockCGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        containerId.toString(),
+        CGROUP_PARAM_MEMORY_OOM_CONTROL)).
+        thenThrow(new ResourceHandlerException());
+    outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
+    Assert.assertFalse(
+        "No report of the oom status should be available.",
+        outOfOom.isPresent());
+
+  }
 }

