hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jh...@apache.org
Subject [hadoop] 04/09: Partial commit of YARN-7224 to get GpuDevice class and changes
Date Thu, 03 Jan 2019 21:24:22 GMT
This is an automated email from the ASF dual-hosted git repository.

jhung pushed a commit to branch YARN-8200.branch3
in repository https://gitbox.apache.org/repos/asf/hadoop.git

commit 1efe9580427e735bf9d2582d492ae0818f205b9e
Author: Jonathan Hung <jhung@linkedin.com>
AuthorDate: Thu Apr 12 13:56:16 2018 -0700

    Partial commit of YARN-7224 to get GpuDevice class and changes
---
 .../container/ResourceMappings.java                |   2 +-
 .../linux/resources/gpu/GpuResourceAllocator.java  | 102 ++++++-------
 .../resources/gpu/GpuResourceHandlerImpl.java      |  26 ++--
 .../resourceplugin/gpu/GpuDevice.java              |  78 ++++++++++
 .../resourceplugin/gpu/GpuDiscoverer.java          |  30 ++--
 .../gpu/GpuNodeResourceUpdateHandler.java          |  10 +-
 .../recovery/NMLeveldbStateStoreService.java       |  67 ++++----
 .../recovery/NMNullStateStoreService.java          |   5 +-
 .../nodemanager/recovery/NMStateStoreService.java  |  17 ++-
 .../TestContainerManagerRecovery.java              |  15 +-
 .../resources/gpu/TestGpuResourceHandler.java      | 168 ++++++++++++++-------
 .../resourceplugin/gpu/TestGpuDiscoverer.java      |  34 ++++-
 .../recovery/NMMemoryStateStoreService.java        |   7 +-
 .../recovery/TestNMLeveldbStateStoreService.java   |  28 +++-
 14 files changed, 396 insertions(+), 193 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java
index d673341..225192b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java
@@ -84,7 +84,7 @@ public class ResourceMappings {
       return Collections.unmodifiableList(resources);
     }
 
-    public void updateAssignedResources(List<Serializable> list) {
+    public void updateAssignedResources(List<? extends Serializable> list) {
       this.resources = new ArrayList<>(list);
     }
 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
index d6bae09..f2bb342 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -26,12 +26,11 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Resource;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 
 import java.io.IOException;
 import java.io.Serializable;
@@ -54,8 +53,8 @@ import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
 public class GpuResourceAllocator {
   final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
 
-  private Set<Integer> allowedGpuDevices = new TreeSet<>();
-  private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
+  private Set<GpuDevice> allowedGpuDevices = new TreeSet<>();
+  private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>();
   private Context nmContext;
 
   public GpuResourceAllocator(Context ctx) {
@@ -63,14 +62,14 @@ public class GpuResourceAllocator {
   }
 
   /**
-   * Contains allowed and denied devices with minor number.
+   * Contains allowed and denied devices
    * Denied devices will be useful for cgroups devices module to do blacklisting
    */
   static class GpuAllocation {
-    private Set<Integer> allowed = Collections.emptySet();
-    private Set<Integer> denied = Collections.emptySet();
+    private Set<GpuDevice> allowed = Collections.emptySet();
+    private Set<GpuDevice> denied = Collections.emptySet();
 
-    GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
+    GpuAllocation(Set<GpuDevice> allowed, Set<GpuDevice> denied) {
       if (allowed != null) {
         this.allowed = ImmutableSet.copyOf(allowed);
       }
@@ -79,21 +78,21 @@ public class GpuResourceAllocator {
       }
     }
 
-    public Set<Integer> getAllowedGPUs() {
+    public Set<GpuDevice> getAllowedGPUs() {
       return allowed;
     }
 
-    public Set<Integer> getDeniedGPUs() {
+    public Set<GpuDevice> getDeniedGPUs() {
       return denied;
     }
   }
 
   /**
    * Add GPU to allowed list
-   * @param minorNumber minor number of the GPU device.
+   * @param gpuDevice gpu device
    */
-  public synchronized void addGpu(int minorNumber) {
-    allowedGpuDevices.add(minorNumber);
+  public synchronized void addGpu(GpuDevice gpuDevice) {
+    allowedGpuDevices.add(gpuDevice);
   }
 
   private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
@@ -117,42 +116,42 @@ public class GpuResourceAllocator {
               + containerId);
     }
 
-    for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
-        GPU_URI)){
-      if (!(deviceId instanceof String)) {
+    for (Serializable gpuDeviceSerializable : c.getResourceMappings()
+        .getAssignedResources(GPU_URI)) {
+      if (!(gpuDeviceSerializable instanceof GpuDevice)) {
         throw new ResourceHandlerException(
             "Trying to recover device id, however it"
-                + " is not String, this shouldn't happen");
+                + " is not GpuDevice, this shouldn't happen");
       }
 
-
-      int devId;
-      try {
-        devId = Integer.parseInt((String)deviceId);
-      } catch (NumberFormatException e) {
-        throw new ResourceHandlerException("Failed to recover device id because"
-            + "it is not a valid integer, devId:" + deviceId);
-      }
+      GpuDevice gpuDevice = (GpuDevice) gpuDeviceSerializable;
 
       // Make sure it is in allowed GPU device.
-      if (!allowedGpuDevices.contains(devId)) {
-        throw new ResourceHandlerException("Try to recover device id = " + devId
-            + " however it is not in allowed device list:" + StringUtils
-            .join(",", allowedGpuDevices));
+      if (!allowedGpuDevices.contains(gpuDevice)) {
+        throw new ResourceHandlerException(
+            "Try to recover device = " + gpuDevice
+                + " however it is not in allowed device list:" + StringUtils
+                .join(",", allowedGpuDevices));
       }
 
       // Make sure it is not occupied by anybody else
-      if (usedDevices.containsKey(devId)) {
-        throw new ResourceHandlerException("Try to recover device id = " + devId
-            + " however it is already assigned to container=" + usedDevices
-            .get(devId) + ", please double check what happened.");
+      if (usedDevices.containsKey(gpuDevice)) {
+        throw new ResourceHandlerException(
+            "Try to recover device id = " + gpuDevice
+                + " however it is already assigned to container=" + usedDevices
+                .get(gpuDevice) + ", please double check what happened.");
       }
 
-      usedDevices.put(devId, containerId);
+      usedDevices.put(gpuDevice, containerId);
     }
   }
 
-  private int getRequestedGpus(Resource requestedResource) {
+  /**
+   * Get number of requested GPUs from resource.
+   * @param requestedResource requested resource
+   * @return #gpus.
+   */
+  public static int getRequestedGpus(Resource requestedResource) {
     try {
       return Long.valueOf(requestedResource.getResourceValue(
           GPU_URI)).intValue();
@@ -164,8 +163,8 @@ public class GpuResourceAllocator {
   /**
    * Assign GPU to requestor
    * @param container container to allocate
-   * @return List of denied Gpus with minor numbers
-   * @throws ResourceHandlerException When failed to
+   * @return allocation results.
+   * @throws ResourceHandlerException When failed to assign GPUs.
    */
   public synchronized GpuAllocation assignGpus(Container container)
       throws ResourceHandlerException {
@@ -180,12 +179,12 @@ public class GpuResourceAllocator {
                 containerId));
       }
 
-      Set<Integer> assignedGpus = new HashSet<>();
+      Set<GpuDevice> assignedGpus = new TreeSet<>();
 
-      for (int deviceNum : allowedGpuDevices) {
-        if (!usedDevices.containsKey(deviceNum)) {
-          usedDevices.put(deviceNum, containerId);
-          assignedGpus.add(deviceNum);
+      for (GpuDevice gpu : allowedGpuDevices) {
+        if (!usedDevices.containsKey(gpu)) {
+          usedDevices.put(gpu, containerId);
+          assignedGpus.add(gpu);
           if (assignedGpus.size() == numRequestedGpuDevices) {
             break;
           }
@@ -194,21 +193,10 @@ public class GpuResourceAllocator {
 
       // Record in state store if we allocated anything
       if (!assignedGpus.isEmpty()) {
-        List<Serializable> allocatedDevices = new ArrayList<>();
-        for (int gpu : assignedGpus) {
-          allocatedDevices.add(String.valueOf(gpu));
-        }
         try {
-          // Update Container#getResourceMapping.
-          ResourceMappings.AssignedResources assignedResources =
-              new ResourceMappings.AssignedResources();
-          assignedResources.updateAssignedResources(allocatedDevices);
-          container.getResourceMappings().addAssignedResources(GPU_URI,
-              assignedResources);
-
           // Update state store.
-          nmContext.getNMStateStore().storeAssignedResources(containerId,
-              GPU_URI, allocatedDevices);
+          nmContext.getNMStateStore().storeAssignedResources(container, GPU_URI,
+              new ArrayList<>(assignedGpus));
         } catch (IOException e) {
           cleanupAssignGpus(containerId);
           throw new ResourceHandlerException(e);
@@ -226,7 +214,7 @@ public class GpuResourceAllocator {
    * @param containerId containerId
    */
   public synchronized void cleanupAssignGpus(ContainerId containerId) {
-    Iterator<Map.Entry<Integer, ContainerId>> iter =
+    Iterator<Map.Entry<GpuDevice, ContainerId>> iter =
         usedDevices.entrySet().iterator();
     while (iter.hasNext()) {
       if (iter.next().getValue().equals(containerId)) {
@@ -236,7 +224,7 @@ public class GpuResourceAllocator {
   }
 
   @VisibleForTesting
-  public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
+  public synchronized Map<GpuDevice, ContainerId> getDeviceAllocationMapping() {
      return new HashMap<>(usedDevices);
   }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index 7144bb2..4a783d3 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -24,8 +24,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -35,6 +33,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
 
 import java.util.ArrayList;
@@ -64,17 +63,23 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
   @Override
   public List<PrivilegedOperation> bootstrap(Configuration configuration)
       throws ResourceHandlerException {
-    List<Integer> minorNumbersOfUsableGpus;
+    List<GpuDevice> usableGpus;
     try {
-      minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
-          .getMinorNumbersOfGpusUsableByYarn();
+      usableGpus = GpuDiscoverer.getInstance()
+          .getGpusUsableByYarn();
+      if (usableGpus == null || usableGpus.isEmpty()) {
+        String message = "GPU is enabled on the NodeManager, but couldn't find "
+            + "any usable GPU devices, please double check configuration.";
+        LOG.error(message);
+        throw new ResourceHandlerException(message);
+      }
     } catch (YarnException e) {
       LOG.error("Exception when trying to get usable GPU device", e);
       throw new ResourceHandlerException(e);
     }
 
-    for (int minorNumber : minorNumbersOfUsableGpus) {
-      gpuAllocator.addGpu(minorNumber);
+    for (GpuDevice gpu : usableGpus) {
+      gpuAllocator.addGpu(gpu);
     }
 
     // And initialize cgroups
@@ -102,10 +107,13 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
           PrivilegedOperation.OperationType.GPU, Arrays
           .asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
       if (!allocation.getDeniedGPUs().isEmpty()) {
+        List<Integer> minorNumbers = new ArrayList<>();
+        for (GpuDevice deniedGpu : allocation.getDeniedGPUs()) {
+          minorNumbers.add(deniedGpu.getMinorNumber());
+        }
         privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
-            StringUtils.join(",", allocation.getDeniedGPUs())));
+            StringUtils.join(",", minorNumbers)));
       }
-
       privilegedOperationExecutor.executePrivilegedOperation(
           privilegedOperation, true);
     } catch (PrivilegedOperationException e) {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java
new file mode 100644
index 0000000..8119924
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import java.io.Serializable;
+
+/**
+ * This class is used to represent GPU device while allocation.
+ */
+public class GpuDevice implements Serializable, Comparable {
+  private int index;
+  private int minorNumber;
+  private static final long serialVersionUID = -6812314470754667710L;
+
+  public GpuDevice(int index, int minorNumber) {
+    this.index = index;
+    this.minorNumber = minorNumber;
+  }
+
+  public int getIndex() {
+    return index;
+  }
+
+  public int getMinorNumber() {
+    return minorNumber;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == null || !(obj instanceof GpuDevice)) {
+      return false;
+    }
+    GpuDevice other = (GpuDevice) obj;
+    return index == other.index && minorNumber == other.minorNumber;
+  }
+
+  @Override
+  public int compareTo(Object obj) {
+    if (obj == null || (!(obj instanceof  GpuDevice))) {
+      return -1;
+    }
+
+    GpuDevice other = (GpuDevice) obj;
+
+    int result = Integer.compare(index, other.index);
+    if (0 != result) {
+      return result;
+    }
+    return Integer.compare(minorNumber, other.minorNumber);
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 47;
+    return prime * index + minorNumber;
+  }
+
+  @Override
+  public String toString() {
+    return "(index=" + index + ",minor_number=" + minorNumber + ")";
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 61b8ce5..6e3cf13 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -136,12 +136,12 @@ public class GpuDiscoverer {
   }
 
   /**
-   * Get list of minor device numbers of Gpu devices usable by YARN.
+   * Get list of GPU devices usable by YARN.
    *
-   * @return List of minor device numbers of Gpu devices.
+   * @return List of GPU devices
    * @throws YarnException when any issue happens
    */
-  public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
+  public synchronized List<GpuDevice> getGpusUsableByYarn()
       throws YarnException {
     validateConfOrThrowException();
 
@@ -149,7 +149,7 @@ public class GpuDiscoverer {
         YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
 
-    List<Integer> minorNumbers = new ArrayList<>();
+    List<GpuDevice> gpuDevices = new ArrayList<>();
 
     if (allowedDevicesStr.equals(
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
@@ -167,21 +167,31 @@ public class GpuDiscoverer {
       }
 
       if (lastDiscoveredGpuInformation.getGpus() != null) {
-        for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
-            .getGpus()) {
-          minorNumbers.add(gpu.getMinorNumber());
+        for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
+             i++) {
+          List<PerGpuDeviceInformation> gpuInfos =
+              lastDiscoveredGpuInformation.getGpus();
+          gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
         }
       }
     } else{
       for (String s : allowedDevicesStr.split(",")) {
         if (s.trim().length() > 0) {
-          minorNumbers.add(Integer.valueOf(s.trim()));
+          String[] kv = s.trim().split(":");
+          if (kv.length != 2) {
+            throw new YarnException(
+                "Illegal format, it should be index:minor_number format, now it="
+                    + s);
+          }
+
+          gpuDevices.add(
+              new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
         }
       }
-      LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
+      LOG.info("Allowed GPU devices:" + gpuDevices);
     }
 
-    return minorNumbers;
+    return gpuDevices;
   }
 
   public synchronized void initialize(Configuration conf) throws YarnException {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index f6bf506..796eb25 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -40,12 +40,14 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
   public void updateConfiguredResource(Resource res) throws YarnException {
     LOG.info("Initializing configured GPU resources for the NodeManager.");
 
-    List<Integer> usableGpus =
-        GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
+    List<GpuDevice> usableGpus =
+        GpuDiscoverer.getInstance().getGpusUsableByYarn();
     if (null == usableGpus || usableGpus.isEmpty()) {
-      LOG.info("Didn't find any usable GPUs on the NodeManager.");
+      String message = "GPU is enabled, but couldn't find any usable GPUs on the "
+          + "NodeManager.";
+      LOG.error(message);
       // No gpu can be used by YARN.
-      return;
+      throw new YarnException(message);
     }
 
     long nUsableGpus = usableGpus.size();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 2261db0..817c78d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -18,27 +18,9 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.recovery;
 
-import static org.fusesource.leveldbjni.JniDBFactory.asString;
-import static org.fusesource.leveldbjni.JniDBFactory.bytes;
-
-import org.apache.hadoop.yarn.api.records.Token;
-import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Timer;
-import java.util.TimerTask;
-import java.util.Set;
-
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -50,9 +32,11 @@ import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestP
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Token;
 import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto;
+import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.MasterKeyProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.VersionProto;
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto;
@@ -60,9 +44,10 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Deletion
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LocalizedResourceProto;
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
 import org.apache.hadoop.yarn.proto.YarnServiceProtos.StartContainerRequestProto;
-import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
+import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.records.Version;
 import org.apache.hadoop.yarn.server.records.impl.pb.VersionPBImpl;
@@ -75,10 +60,26 @@ import org.iq80.leveldb.DB;
 import org.iq80.leveldb.DBException;
 import org.iq80.leveldb.Options;
 import org.iq80.leveldb.WriteBatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
+
+import static org.fusesource.leveldbjni.JniDBFactory.asString;
+import static org.fusesource.leveldbjni.JniDBFactory.bytes;
 
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.ListMultimap;
 
 public class NMLeveldbStateStoreService extends NMStateStoreService {
 
@@ -1184,15 +1185,18 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
   }
 
   @Override
-  public void storeAssignedResources(ContainerId containerId,
-      String resourceType, List<Serializable> assignedResources)
+  public void storeAssignedResources(Container container,
+      String resourceType, List<? extends Serializable> assignedResources)
       throws IOException {
     if (LOG.isDebugEnabled()) {
-      LOG.debug("storeAssignedResources: containerId=" + containerId
-          + ", assignedResources=" + StringUtils.join(",", assignedResources));
+      LOG.debug(
+          "storeAssignedResources: containerId=" + container.getContainerId()
+              + ", assignedResources=" + StringUtils
+              .join(",", assignedResources));
+
     }
 
-    String keyResChng = CONTAINERS_KEY_PREFIX + containerId.toString()
+    String keyResChng = CONTAINERS_KEY_PREFIX + container.getContainerId().toString()
         + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType;
     try {
       WriteBatch batch = db.createWriteBatch();
@@ -1210,6 +1214,9 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
     } catch (DBException e) {
       throw new IOException(e);
     }
+
+    // update container resource mapping.
+    updateContainerResourceMapping(container, resourceType, assignedResources);
   }
 
   @SuppressWarnings("deprecation")
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
index 6e3707b..77ed241 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 
 // The state store to use when state isn't being stored
 public class NMNullStateStoreService extends NMStateStoreService {
@@ -268,8 +269,8 @@ public class NMNullStateStoreService extends NMStateStoreService {
   }
 
   @Override
-  public void storeAssignedResources(ContainerId containerId,
-      String resourceType, List<Serializable> assignedResources)
+  public void storeAssignedResources(Container container,
+      String resourceType, List<? extends Serializable> assignedResources)
       throws IOException {
   }
 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
index a929fe2..44f5e3f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
@@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
 import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 
 @Private
@@ -732,13 +733,13 @@ public abstract class NMStateStoreService extends AbstractService {
   /**
    * Store the assigned resources to a container.
    *
-   * @param containerId Container Id
+   * @param container NMContainer
    * @param resourceType Resource Type
    * @param assignedResources Assigned resources
    * @throws IOException if fails
    */
-  public abstract void storeAssignedResources(ContainerId containerId,
-      String resourceType, List<Serializable> assignedResources)
+  public abstract void storeAssignedResources(Container container,
+      String resourceType, List<? extends Serializable> assignedResources)
       throws IOException;
 
   protected abstract void initStorage(Configuration conf) throws IOException;
@@ -746,4 +747,14 @@ public abstract class NMStateStoreService extends AbstractService {
   protected abstract void startStorage() throws IOException;
 
   protected abstract void closeStorage() throws IOException;
+
+  protected void updateContainerResourceMapping(Container container,
+      String resourceType, List<? extends Serializable> assignedResources) {
+    // Update Container#getResourceMapping.
+    ResourceMappings.AssignedResources newAssigned =
+        new ResourceMappings.AssignedResources();
+    newAssigned.updateAssignedResources(assignedResources);
+    container.getResourceMappings().addAssignedResources(resourceType,
+        newAssigned);
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 6c0c6de..89d06b9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -519,16 +519,18 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
 
     commonLaunchContainer(appId, cid, cm);
 
+    Container nmContainer = context.getContainers().get(cid);
+
     Application app = context.getApplications().get(appId);
     assertNotNull(app);
 
     // store resource mapping of the container
-    List<Serializable> gpuResources = Arrays.asList("1", "2", "3");
-    stateStore.storeAssignedResources(cid, "gpu", gpuResources);
-    List<Serializable> numaResources = Arrays.asList("numa1");
-    stateStore.storeAssignedResources(cid, "numa", numaResources);
-    List<Serializable> fpgaResources = Arrays.asList("fpga1", "fpga2");
-    stateStore.storeAssignedResources(cid, "fpga", fpgaResources);
+    List<String> gpuResources = Arrays.asList("1", "2", "3");
+    stateStore.storeAssignedResources(nmContainer, "gpu", gpuResources);
+    List<String> numaResources = Arrays.asList("numa1");
+    stateStore.storeAssignedResources(nmContainer, "numa", numaResources);
+    List<String> fpgaResources = Arrays.asList("fpga1", "fpga2");
+    stateStore.storeAssignedResources(nmContainer, "fpga", fpgaResources);
 
     cm.stop();
     context = createContext(conf, stateStore);
@@ -540,7 +542,6 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
     app = context.getApplications().get(appId);
     assertNotNull(app);
 
-    Container nmContainer = context.getContainers().get(cid);
     Assert.assertNotNull(nmContainer);
     ResourceMappings resourceMappings = nmContainer.getResourceMappings();
     List<Serializable> assignedResource = resourceMappings
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
index 5c70f7a..234d309 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
@@ -36,15 +35,17 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
 import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -90,7 +91,7 @@ public class TestGpuResourceHandler {
   @Test
   public void testBootStrap() throws Exception {
     Configuration conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
 
     GpuDiscoverer.getInstance().initialize(conf);
 
@@ -104,8 +105,8 @@ public class TestGpuResourceHandler {
         .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
   }
 
-  private static Container mockContainerWithGpuRequest(int id,
-      int numGpuRequest) {
+  private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
+      boolean dockerContainerEnabled) {
     Container c = mock(Container.class);
     when(c.getContainerId()).thenReturn(getContainerId(id));
 
@@ -115,29 +116,48 @@ public class TestGpuResourceHandler {
     res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
     when(c.getResource()).thenReturn(res);
     when(c.getResourceMappings()).thenReturn(resMapping);
+
+    ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
+    Map<String, String> env = new HashMap<>();
+    if (dockerContainerEnabled) {
+      env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker");
+    }
+    when(clc.getEnvironment()).thenReturn(env);
+    when(c.getLaunchContext()).thenReturn(clc);
     return c;
   }
 
+  private static Container mockContainerWithGpuRequest(int id,
+      int numGpuRequest) {
+    return mockContainerWithGpuRequest(id, numGpuRequest, false);
+  }
+
   private void verifyDeniedDevices(ContainerId containerId,
-      List<Integer> deniedDevices)
+      List<GpuDevice> deniedDevices)
       throws ResourceHandlerException, PrivilegedOperationException {
     verify(mockCGroupsHandler, times(1)).createCGroup(
         CGroupsHandler.CGroupController.DEVICES, containerId.toString());
 
     if (null != deniedDevices && !deniedDevices.isEmpty()) {
+      List<Integer> deniedDevicesMinorNumber = new ArrayList<>();
+      for (GpuDevice deniedDevice : deniedDevices) {
+        deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
+      }
+      PrivilegedOperation arg = new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+          .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+              containerId.toString(),
+              GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+              StringUtils.join(",", deniedDevicesMinorNumber)));
+      for (String s : arg.getArguments()) {System.out.println("EXPECTED ARGS " + s);}
       verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
-          new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
-              .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
-                  containerId.toString(),
-                  GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
-                  StringUtils.join(",", deniedDevices))), true);
+          arg, true);
     }
   }
 
-  @Test
-  public void testAllocation() throws Exception {
+  private void commonTestAllocation(boolean dockerContainerEnabled)
+      throws Exception {
     Configuration conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
     GpuDiscoverer.getInstance().initialize(conf);
 
     gpuResourceHandler.bootstrap(conf);
@@ -145,31 +165,52 @@ public class TestGpuResourceHandler {
         gpuResourceHandler.getGpuAllocator().getAvailableGpus());
 
     /* Start container 1, asks 3 containers */
-    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+    gpuResourceHandler.preStart(
+        mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
 
     // Only device=4 will be blocked.
-    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+    if (dockerContainerEnabled) {
+      verifyDeniedDevices(getContainerId(1), Collections.<GpuDevice>emptyList());
+    } else{
+      verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
+    }
 
     /* Start container 2, asks 2 containers. Excepted to fail */
     boolean failedToAllocate = false;
     try {
-      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+      gpuResourceHandler.preStart(
+          mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
     } catch (ResourceHandlerException e) {
       failedToAllocate = true;
     }
     Assert.assertTrue(failedToAllocate);
 
     /* Start container 3, ask 1 container, succeeded */
-    gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+    gpuResourceHandler.preStart(
+        mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
 
     // devices = 0/1/3 will be blocked
-    verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+    if (dockerContainerEnabled) {
+      verifyDeniedDevices(getContainerId(3), Collections.<GpuDevice>emptyList());
+    } else {
+      verifyDeniedDevices(getContainerId(3), Arrays
+          .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+              new GpuDevice(2, 3)));
+    }
 
-    /* Start container 4, ask 0 container, succeeded */
-    gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
 
-    // All devices will be blocked
-    verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+    /* Start container 4, ask 0 container, succeeded */
+    gpuResourceHandler.preStart(
+        mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
+
+    if (dockerContainerEnabled) {
+      verifyDeniedDevices(getContainerId(4), Collections.<GpuDevice>emptyList());
+    } else{
+      // All devices will be blocked
+      verifyDeniedDevices(getContainerId(4), Arrays
+          .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+              new GpuDevice(3, 4)));
+    }
 
     /* Release container-1, expect cgroups deleted */
     gpuResourceHandler.postComplete(getContainerId(1));
@@ -188,12 +229,24 @@ public class TestGpuResourceHandler {
         gpuResourceHandler.getGpuAllocator().getAvailableGpus());
   }
 
+  @Test
+  public void testAllocationWhenDockerContainerEnabled() throws Exception {
+    // When docker container is enabled, no devices should be written to
+    // devices.deny.
+    commonTestAllocation(true);
+  }
+
+  @Test
+  public void testAllocation() throws Exception {
+    commonTestAllocation(false);
+  }
+
   @SuppressWarnings("unchecked")
   @Test
   public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
       throws Exception {
     Configuration conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
     GpuDiscoverer.getInstance().initialize(conf);
 
     gpuResourceHandler.bootstrap(conf);
@@ -202,7 +255,7 @@ public class TestGpuResourceHandler {
 
     doThrow(new IOException("Exception ...")).when(mockNMStateStore)
         .storeAssignedResources(
-        any(ContainerId.class), anyString(), anyList());
+        any(Container.class), anyString(), anyList());
 
     boolean exception = false;
     /* Start container 1, asks 3 containers */
@@ -225,13 +278,16 @@ public class TestGpuResourceHandler {
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
     GpuDiscoverer.getInstance().initialize(conf);
 
-    gpuResourceHandler.bootstrap(conf);
-    Assert.assertEquals(0,
-        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+    try {
+      gpuResourceHandler.bootstrap(conf);
+      Assert.fail("Should fail because no GPU available");
+    } catch (ResourceHandlerException e) {
+      // Expected because of no resource available
+    }
 
     /* Start container 1, asks 0 containers */
     gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
-    verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+    verifyDeniedDevices(getContainerId(1), Collections.<GpuDevice>emptyList());
 
     /* Start container 2, asks 1 containers. Excepted to fail */
     boolean failedToAllocate = false;
@@ -254,7 +310,7 @@ public class TestGpuResourceHandler {
   @Test
   public void testAllocationStored() throws Exception {
     Configuration conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
     GpuDiscoverer.getInstance().initialize(conf);
 
     gpuResourceHandler.bootstrap(conf);
@@ -265,33 +321,33 @@ public class TestGpuResourceHandler {
     Container container = mockContainerWithGpuRequest(1, 3);
     gpuResourceHandler.preStart(container);
 
-    verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
-        ResourceInformation.GPU_URI,
-        Arrays.asList("0", "1", "3"));
-
-    Assert.assertEquals(3, container.getResourceMappings()
-        .getAssignedResources(ResourceInformation.GPU_URI).size());
+    verify(mockNMStateStore).storeAssignedResources(container,
+        ResourceInformation.GPU_URI, Arrays
+            .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+                new GpuDevice(2, 3)));
 
     // Only device=4 will be blocked.
-    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+    verifyDeniedDevices(getContainerId(1), Arrays.<GpuDevice>asList(new GpuDevice(3, 4)));
 
     /* Start container 2, ask 0 container, succeeded */
     container = mockContainerWithGpuRequest(2, 0);
     gpuResourceHandler.preStart(container);
 
-    verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+    verifyDeniedDevices(getContainerId(2), Arrays
+        .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+            new GpuDevice(3, 4)));
     Assert.assertEquals(0, container.getResourceMappings()
         .getAssignedResources(ResourceInformation.GPU_URI).size());
 
     // Store assigned resource will not be invoked.
     verify(mockNMStateStore, never()).storeAssignedResources(
-        eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
+        eq(container), eq(ResourceInformation.GPU_URI), anyList());
   }
 
   @Test
   public void testRecoverResourceAllocation() throws Exception {
     Configuration conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
     GpuDiscoverer.getInstance().initialize(conf);
 
     gpuResourceHandler.bootstrap(conf);
@@ -302,7 +358,8 @@ public class TestGpuResourceHandler {
     ResourceMappings rmap = new ResourceMappings();
     ResourceMappings.AssignedResources ar =
         new ResourceMappings.AssignedResources();
-    ar.updateAssignedResources(Arrays.asList("1", "3"));
+    ar.updateAssignedResources(
+        Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3)));
     rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
     when(nmContainer.getResourceMappings()).thenReturn(rmap);
 
@@ -312,12 +369,15 @@ public class TestGpuResourceHandler {
     // Reacquire container restore state of GPU Resource Allocator.
     gpuResourceHandler.reacquireContainer(getContainerId(1));
 
-    Map<Integer, ContainerId> deviceAllocationMapping =
+    Map<GpuDevice, ContainerId> deviceAllocationMapping =
         gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
     Assert.assertEquals(2, deviceAllocationMapping.size());
     Assert.assertTrue(
-        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
-    Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+        deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
+    Assert.assertTrue(
+        deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
+    Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+        getContainerId(1));
 
     // TEST CASE
     // Try to reacquire a container but requested device is not in allowed list.
@@ -325,7 +385,8 @@ public class TestGpuResourceHandler {
     rmap = new ResourceMappings();
     ar = new ResourceMappings.AssignedResources();
     // id=5 is not in allowed list.
-    ar.updateAssignedResources(Arrays.asList("4", "5"));
+    ar.updateAssignedResources(
+        Arrays.asList(new GpuDevice(3, 4), new GpuDevice(4, 5)));
     rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
     when(nmContainer.getResourceMappings()).thenReturn(rmap);
 
@@ -345,9 +406,10 @@ public class TestGpuResourceHandler {
     deviceAllocationMapping =
         gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
     Assert.assertEquals(2, deviceAllocationMapping.size());
-    Assert.assertTrue(
-        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
-    Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+    Assert.assertTrue(deviceAllocationMapping.keySet()
+        .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+    Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+        getContainerId(1));
 
     // TEST CASE
     // Try to reacquire a container but requested device is already assigned.
@@ -355,7 +417,8 @@ public class TestGpuResourceHandler {
     rmap = new ResourceMappings();
     ar = new ResourceMappings.AssignedResources();
     // id=3 is already assigned
-    ar.updateAssignedResources(Arrays.asList("4", "3"));
+    ar.updateAssignedResources(
+        Arrays.asList(new GpuDevice(3, 4), new GpuDevice(2, 3)));
     rmap.addAssignedResources("gpu", ar);
     when(nmContainer.getResourceMappings()).thenReturn(rmap);
 
@@ -375,8 +438,9 @@ public class TestGpuResourceHandler {
     deviceAllocationMapping =
         gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
     Assert.assertEquals(2, deviceAllocationMapping.size());
-    Assert.assertTrue(
-        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
-    Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+    Assert.assertTrue(deviceAllocationMapping.keySet()
+        .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+    Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+        getContainerId(1));
   }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 83bace2..4abb633 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -101,23 +101,41 @@ public class TestGpuDiscoverer {
     GpuDeviceInformation info = plugin.getGpuDeviceInformation();
 
     Assert.assertTrue(info.getGpus().size() > 0);
-    Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
+    Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
         info.getGpus().size());
   }
 
   @Test
   public void getNumberOfUsableGpusFromConfig() throws YarnException {
     Configuration conf = new Configuration(false);
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
+
+    // Illegal format
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
     GpuDiscoverer plugin = new GpuDiscoverer();
+    try {
+      plugin.initialize(conf);
+      plugin.getGpusUsableByYarn();
+      Assert.fail("Illegal format, should fail.");
+    } catch (YarnException e) {
+      // Expected
+    }
+
+    // Valid format
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
+    plugin = new GpuDiscoverer();
     plugin.initialize(conf);
 
-    List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
-    Assert.assertEquals(4, minorNumbers.size());
+    List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
+    Assert.assertEquals(4, usableGpuDevices.size());
+
+    Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
+    Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
+    Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
+    Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
 
-    Assert.assertTrue(0 == minorNumbers.get(0));
-    Assert.assertTrue(1 == minorNumbers.get(1));
-    Assert.assertTrue(2 == minorNumbers.get(2));
-    Assert.assertTrue(4 == minorNumbers.get(3));
+    Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
+    Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
+    Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
+    Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
   }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index 5d424ad..695544c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -43,6 +43,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 
 
@@ -515,13 +516,13 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
   }
 
   @Override
-  public void storeAssignedResources(ContainerId containerId,
-      String resourceType, List<Serializable> assignedResources)
+  public void storeAssignedResources(Container container,
+      String resourceType, List<? extends Serializable> assignedResources)
       throws IOException {
     ResourceMappings.AssignedResources ar =
         new ResourceMappings.AssignedResources();
     ar.updateAssignedResources(assignedResources);
-    containerStates.get(containerId).getResourceMappings()
+    containerStates.get(container.getContainerId()).getResourceMappings()
         .addAssignedResources(resourceType, ar);
   }
 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index bc17868..f51a230 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -32,6 +32,7 @@ import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.timeout;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 import java.io.File;
 import java.io.IOException;
@@ -72,6 +73,8 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyTokenSecretManager;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.LocalResourceTrackerState;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState;
@@ -1151,16 +1154,21 @@ public class TestNMLeveldbStateStoreService {
     ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
     storeMockContainer(containerId);
 
+    Container container = mock(Container.class);
+    when(container.getContainerId()).thenReturn(containerId);
+    ResourceMappings resourceMappings = new ResourceMappings();
+    when(container.getResourceMappings()).thenReturn(resourceMappings);
+
     // Store ResourceMapping
-    stateStore.storeAssignedResources(containerId, "gpu",
+    stateStore.storeAssignedResources(container, "gpu",
         Arrays.asList("1", "2", "3"));
     // This will overwrite above
-    List<Serializable> gpuRes1 = Arrays.asList("1", "2", "4");
-    stateStore.storeAssignedResources(containerId, "gpu", gpuRes1);
-    List<Serializable> fpgaRes = Arrays.asList("3", "4", "5", "6");
-    stateStore.storeAssignedResources(containerId, "fpga", fpgaRes);
-    List<Serializable> numaRes = Arrays.asList("numa1");
-    stateStore.storeAssignedResources(containerId, "numa", numaRes);
+    List<String> gpuRes1 = Arrays.asList("1", "2", "4");
+    stateStore.storeAssignedResources(container, "gpu", gpuRes1);
+    List<String> fpgaRes = Arrays.asList("3", "4", "5", "6");
+    stateStore.storeAssignedResources(container, "fpga", fpgaRes);
+    List<String> numaRes = Arrays.asList("numa1");
+    stateStore.storeAssignedResources(container, "numa", numaRes);
 
     // add a invalid key
     restartStateStore();
@@ -1170,12 +1178,18 @@ public class TestNMLeveldbStateStoreService {
     List<Serializable> res = rcs.getResourceMappings()
         .getAssignedResources("gpu");
     Assert.assertTrue(res.equals(gpuRes1));
+    Assert.assertTrue(
+        resourceMappings.getAssignedResources("gpu").equals(gpuRes1));
 
     res = rcs.getResourceMappings().getAssignedResources("fpga");
     Assert.assertTrue(res.equals(fpgaRes));
+    Assert.assertTrue(
+        resourceMappings.getAssignedResources("fpga").equals(fpgaRes));
 
     res = rcs.getResourceMappings().getAssignedResources("numa");
     Assert.assertTrue(res.equals(numaRes));
+    Assert.assertTrue(
+        resourceMappings.getAssignedResources("numa").equals(numaRes));
   }
 
   private StartContainerRequest storeMockContainer(ContainerId containerId)


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message