hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From x...@apache.org
Subject [04/45] hadoop git commit: YARN-8423. GPU does not get released even though the application gets killed. (Sunil G via wangda)
Date Mon, 02 Jul 2018 20:32:21 GMT
YARN-8423. GPU does not get released even though the application gets killed. (Sunil G via wangda)

Change-Id: I570db7d60f8c6c21762dd618a9207d1107c486a0


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/ada8f63d
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/ada8f63d
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/ada8f63d

Branch: refs/heads/HDDS-4
Commit: ada8f63d0b3739d245300461387b0516dc92ccf9
Parents: 62d83ca
Author: Wangda Tan <wangda@apache.org>
Authored: Tue Jun 26 19:25:57 2018 -0700
Committer: Wangda Tan <wangda@apache.org>
Committed: Tue Jun 26 19:25:57 2018 -0700

----------------------------------------------------------------------
 .../containermanager/container/Container.java   |  6 ++
 .../container/ContainerImpl.java                | 11 ++++
 .../resources/gpu/GpuResourceAllocator.java     | 68 +++++++++++++++++++-
 .../resources/gpu/GpuResourceHandlerImpl.java   |  1 -
 .../nodemanager/webapp/MockContainer.java       |  3 +
 5 files changed, 85 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/ada8f63d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
index 5d48d84..4912d02 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
@@ -113,4 +113,10 @@ public interface Container extends EventHandler<ContainerEvent> {
   ResourceMappings getResourceMappings();
 
   void sendPauseEvent(String description);
+
+  /**
+   * Verify container is in final states.
+   * @return true/false based on container's state
+   */
+  boolean isContainerInFinalStates();
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ada8f63d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 0541544..f76e682 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -2223,4 +2223,15 @@ public class ContainerImpl implements Container {
   SlidingWindowRetryPolicy getRetryPolicy() {
     return retryPolicy;
   }
+
+  @Override
+  public boolean isContainerInFinalStates() {
+    ContainerState state = getContainerState();
+    return state == ContainerState.KILLING || state == ContainerState.DONE
+        || state == ContainerState.LOCALIZATION_FAILED
+        || state == ContainerState.CONTAINER_RESOURCES_CLEANINGUP
+        || state == ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL
+        || state == ContainerState.EXITED_WITH_FAILURE
+        || state == ContainerState.EXITED_WITH_SUCCESS;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ada8f63d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
index 5bdffc3..81a9655 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -26,6 +26,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -36,10 +37,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -54,6 +53,7 @@ import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
  */
 public class GpuResourceAllocator {
   final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
+  private static final int WAIT_MS_PER_LOOP = 1000;
 
   private Set<GpuDevice> allowedGpuDevices = new TreeSet<>();
   private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>();
@@ -168,7 +168,44 @@ public class GpuResourceAllocator {
    * @return allocation results.
    * @throws ResourceHandlerException When failed to assign GPUs.
    */
-  public synchronized GpuAllocation assignGpus(Container container)
+  public GpuAllocation assignGpus(Container container)
+      throws ResourceHandlerException {
+    GpuAllocation allocation = internalAssignGpus(container);
+
+    // Wait for a maximum of 120 seconds if no available GPU are there which
+    // are yet to be released.
+    final int timeoutMsecs = 120 * WAIT_MS_PER_LOOP;
+    int timeWaiting = 0;
+    while (allocation == null) {
+      if (timeWaiting >= timeoutMsecs) {
+        break;
+      }
+
+      // Sleep for 1 sec to ensure there are some free GPU devices which are
+      // getting released.
+      try {
+        LOG.info("Container : " + container.getContainerId()
+            + " is waiting for free GPU devices.");
+        Thread.sleep(WAIT_MS_PER_LOOP);
+        timeWaiting += WAIT_MS_PER_LOOP;
+        allocation = internalAssignGpus(container);
+      } catch (InterruptedException e) {
+        // On any interrupt, break the loop and continue execution.
+        break;
+      }
+    }
+
+    if(allocation == null) {
+      String message = "Could not get valid GPU device for container '" +
+          container.getContainerId()
+          + "' as some other containers might not releasing GPUs.";
+      LOG.warn(message);
+      throw new ResourceHandlerException(message);
+    }
+    return allocation;
+  }
+
+  private synchronized GpuAllocation internalAssignGpus(Container container)
       throws ResourceHandlerException {
     Resource requestedResource = container.getResource();
     ContainerId containerId = container.getContainerId();
@@ -176,6 +213,14 @@ public class GpuResourceAllocator {
     // Assign Gpus to container if requested some.
     if (numRequestedGpuDevices > 0) {
       if (numRequestedGpuDevices > getAvailableGpus()) {
+        // If there are some devices which are getting released, wait for few
+        // seconds to get it.
+        if (numRequestedGpuDevices <= getReleasingGpus() + getAvailableGpus()) {
+          return null;
+        }
+      }
+
+      if (numRequestedGpuDevices > getAvailableGpus()) {
         throw new ResourceHandlerException(
             getResourceHandlerExceptionMessage(numRequestedGpuDevices,
                 containerId));
@@ -211,6 +256,23 @@ public class GpuResourceAllocator {
     return new GpuAllocation(null, allowedGpuDevices);
   }
 
+  private synchronized long getReleasingGpus() {
+    long releasingGpus = 0;
+    Iterator<Map.Entry<GpuDevice, ContainerId>> iter = usedDevices.entrySet()
+        .iterator();
+    while (iter.hasNext()) {
+      ContainerId containerId = iter.next().getValue();
+      Container container;
+      if ((container = nmContext.getContainers().get(containerId)) != null) {
+        if (container.isContainerInFinalStates()) {
+          releasingGpus = releasingGpus + container.getResource()
+              .getResourceInformation(ResourceInformation.GPU_URI).getValue();
+        }
+      }
+    }
+    return releasingGpus;
+  }
+
   /**
    * Clean up all Gpus assigned to containerId
    * @param containerId containerId

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ada8f63d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index 587fcb4..1184382 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
 
-import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ada8f63d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 325709b..67dfef2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -255,4 +255,7 @@ public class MockContainer implements Container {
   public void sendPauseEvent(String description) {
 
   }
+  @Override public boolean isContainerInFinalStates() {
+    return false;
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message