hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sun...@apache.org
Subject [2/3] hadoop git commit: YARN-6620. Add support in NodeManager to isolate GPU devices by using CGroups. Contributed by Wangda Tan.
Date Wed, 11 Oct 2017 18:16:11 GMT
http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
new file mode 100644
index 0000000..9576ce7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+
+/**
+ * {@link ResourcePlugin} implementation for GPUs. It exposes a lazily
+ * created {@link GpuResourceHandlerImpl} and the
+ * {@link GpuNodeResourceUpdateHandler} created during {@link #initialize}.
+ */
+public class GpuResourcePlugin implements ResourcePlugin {
+  // Built by the first createResourceHandler() call and reused afterwards.
+  private ResourceHandler cachedHandler;
+  // Assigned by initialize(); stays null until then.
+  private GpuNodeResourceUpdateHandler updateHandler;
+
+  /**
+   * Creates the node resource update handler, then initializes the
+   * {@code GpuDiscoverer} instance with the context's configuration.
+   *
+   * @param context context that supplies the configuration
+   * @throws YarnException propagated from the calls made here, if thrown
+   */
+  @Override
+  public synchronized void initialize(Context context) throws YarnException {
+    this.updateHandler = new GpuNodeResourceUpdateHandler();
+    GpuDiscoverer.getInstance().initialize(context.getConf());
+  }
+
+  /**
+   * Returns the GPU resource handler, constructing it on the first call.
+   * Later calls return the same instance and ignore their arguments.
+   *
+   * @param context passed to the handler constructor
+   * @param cGroupsHandler passed to the handler constructor
+   * @param privilegedOperationExecutor passed to the handler constructor
+   * @return the single handler instance held by this plugin
+   */
+  @Override
+  public synchronized ResourceHandler createResourceHandler(
+      Context context, CGroupsHandler cGroupsHandler,
+      PrivilegedOperationExecutor privilegedOperationExecutor) {
+    if (this.cachedHandler != null) {
+      return this.cachedHandler;
+    }
+
+    this.cachedHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
+        privilegedOperationExecutor);
+    return this.cachedHandler;
+  }
+
+  /**
+   * @return the update handler created by {@link #initialize}, or
+   *         {@code null} if {@code initialize} has not been called
+   */
+  @Override
+  public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
+    return this.updateHandler;
+  }
+
+  /**
+   * No resources are released here.
+   */
+  @Override
+  public void cleanup() throws YarnException {
+    // Do nothing.
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
new file mode 100644
index 0000000..977032a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.List;
+
+/**
+ * Information about every GPU device in the system, bound to the
+ * {@code nvidia_smi_log} root element.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "nvidia_smi_log")
+public class GpuDeviceInformation {
+  // One entry per "gpu" element; null until setGpus() is called.
+  List<PerGpuDeviceInformation> gpus;
+
+  // Reported as "N/A" until setDriverVersion() is called.
+  String driverVersion = "N/A";
+
+  // More fields like topology information could be added when needed.
+  // ...
+
+  /**
+   * @return the per-device entries, possibly {@code null}
+   */
+  @javax.xml.bind.annotation.XmlElement(name = "gpu")
+  public List<PerGpuDeviceInformation> getGpus() {
+    return this.gpus;
+  }
+
+  public void setGpus(List<PerGpuDeviceInformation> gpus) {
+    this.gpus = gpus;
+  }
+
+  /**
+   * @return the driver version string
+   */
+  @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+  public String getDriverVersion() {
+    return this.driverVersion;
+  }
+
+  public void setDriverVersion(String driverVersion) {
+    this.driverVersion = driverVersion;
+  }
+
+  /**
+   * Builds a multi-line summary: a header line with the driver version,
+   * followed by one tab-indented line per GPU when the list is set.
+   */
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder();
+    text.append("=== Gpus in the system ===\n");
+    text.append("\tDriver Version:");
+    text.append(getDriverVersion());
+    text.append("\n");
+
+    if (gpus == null) {
+      return text.toString();
+    }
+
+    for (PerGpuDeviceInformation device : gpus) {
+      text.append("\t");
+      text.append(device.toString());
+      text.append("\n");
+    }
+    return text.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 0000000..1bd92f6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
+/**
+ * Parses XML text into a {@link GpuDeviceInformation} using JAXB on top of
+ * a SAX reader. The reader and unmarshaller are created on the first
+ * {@link #parseXml(String)} call and reused by later calls.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDeviceInformationParser {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      GpuDeviceInformationParser.class);
+
+  // Both stay null until init() has completed successfully.
+  private Unmarshaller unmarshaller = null;
+  private XMLReader xmlReader = null;
+
+  /**
+   * Creates the SAX reader and the JAXB unmarshaller.
+   *
+   * @throws SAXException if a parser feature cannot be set or the reader
+   *         cannot be obtained
+   * @throws ParserConfigurationException if the SAX parser cannot be created
+   * @throws JAXBException if the JAXB context or unmarshaller cannot be
+   *         created
+   */
+  private void init()
+      throws SAXException, ParserConfigurationException, JAXBException {
+    SAXParserFactory spf = SAXParserFactory.newInstance();
+    // Disable external-dtd since by default nvidia-smi output contains
+    // <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
+    spf.setFeature(
+        "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+        false);
+    spf.setFeature("http://xml.org/sax/features/validation", false);
+
+    JAXBContext jaxbContext = JAXBContext.newInstance(
+        GpuDeviceInformation.class);
+
+    // unmarshaller is assigned last: parseXml() uses it as the
+    // "initialized" flag, so a failure above triggers a retry next time.
+    this.xmlReader = spf.newSAXParser().getXMLReader();
+    this.unmarshaller = jaxbContext.createUnmarshaller();
+  }
+
+  /**
+   * Parses the given XML text.
+   *
+   * @param xmlContent XML document text; must not be {@code null}
+   * @return the unmarshalled device information
+   * @throws YarnException if {@code xmlContent} is {@code null}, if the
+   *         parser cannot be initialized, or if unmarshalling fails
+   */
+  public synchronized GpuDeviceInformation parseXml(String xmlContent)
+      throws YarnException {
+    if (xmlContent == null) {
+      // StringReader rejects null with a bare NullPointerException; report
+      // it through the declared exception type instead.
+      throw new YarnException(
+          "Cannot parse GPU device information from null XML content");
+    }
+
+    if (unmarshaller == null) {
+      try {
+        init();
+      } catch (SAXException | ParserConfigurationException | JAXBException e) {
+        LOG.error("Exception while initialize parser", e);
+        throw new YarnException(e);
+      }
+    }
+
+    InputSource inputSource = new InputSource(new StringReader(xmlContent));
+    SAXSource source = new SAXSource(xmlReader, inputSource);
+    try {
+      return (GpuDeviceInformation) unmarshaller.unmarshal(source);
+    } catch (JAXBException e) {
+      LOG.error("Exception while parsing xml", e);
+      throw new YarnException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 0000000..f315313
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
+/**
+ * Capture single GPU device information such as memory size, temperature,
+ * utilization. Bound to the {@code gpu} XML element.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "gpu")
+public class PerGpuDeviceInformation {
+
+  // Defaults that remain in place until the matching setter is called.
+  private String productName = "N/A";
+  private String uuid = "N/A";
+  private int minorNumber = -1;
+
+  // Null until the matching setter is called.
+  private PerGpuUtilizations gpuUtilizations;
+  private PerGpuMemoryUsage gpuMemoryUsage;
+  private PerGpuTemperature temperature;
+
+  /**
+   * Convert formats like "34 C", "75.6 %" to float.
+   */
+  @InterfaceAudience.Private
+  @InterfaceStability.Unstable
+  static class StrToFloatBeforeSpaceAdapter extends
+      XmlAdapter<String, Float> {
+    /**
+     * @return the decimal string form of {@code v}, or an empty string
+     *         when {@code v} is {@code null}
+     */
+    @Override
+    public String marshal(Float v) throws Exception {
+      if (v == null) {
+        return "";
+      }
+      return String.valueOf(v);
+    }
+
+    /**
+     * @return the number before the first space, or {@code -1} when
+     *         {@code v} is {@code null}
+     * @throws NumberFormatException if that token is not a float
+     */
+    @Override
+    public Float unmarshal(String v) throws Exception {
+      if (v == null) {
+        return -1f;
+      }
+
+      // Trim first: with leading whitespace the first space-separated
+      // token would be empty and a valid reading would fail to parse.
+      return Float.valueOf(v.trim().split(" ")[0]);
+    }
+  }
+
+  /**
+   * Convert formats like "725 MiB" to long.
+   */
+  @InterfaceAudience.Private
+  @InterfaceStability.Unstable
+  static class StrToMemAdapter extends XmlAdapter<String, Long> {
+    /**
+     * @return {@code v} followed by " MiB", or an empty string when
+     *         {@code v} is {@code null}
+     */
+    @Override
+    public String marshal(Long v) throws Exception {
+      if (v == null) {
+        return "";
+      }
+      return String.valueOf(v) + " MiB";
+    }
+
+    /**
+     * @return the number before the first space, or {@code -1} when
+     *         {@code v} is {@code null}
+     * @throws NumberFormatException if that token is not a long
+     */
+    @Override
+    public Long unmarshal(String v) throws Exception {
+      if (v == null) {
+        return -1L;
+      }
+      // Trim for the same reason as in StrToFloatBeforeSpaceAdapter.
+      return Long.valueOf(v.trim().split(" ")[0]);
+    }
+  }
+
+  @XmlElement(name = "temperature")
+  public PerGpuTemperature getTemperature() {
+    return temperature;
+  }
+
+  public void setTemperature(PerGpuTemperature temperature) {
+    this.temperature = temperature;
+  }
+
+  @XmlElement(name = "uuid")
+  public String getUuid() {
+    return uuid;
+  }
+
+  public void setUuid(String uuid) {
+    this.uuid = uuid;
+  }
+
+  @XmlElement(name = "product_name")
+  public String getProductName() {
+    return productName;
+  }
+
+  public void setProductName(String productName) {
+    this.productName = productName;
+  }
+
+  @XmlElement(name = "minor_number")
+  public int getMinorNumber() {
+    return minorNumber;
+  }
+
+  public void setMinorNumber(int minorNumber) {
+    this.minorNumber = minorNumber;
+  }
+
+  @XmlElement(name = "utilization")
+  public PerGpuUtilizations getGpuUtilizations() {
+    return gpuUtilizations;
+  }
+
+  public void setGpuUtilizations(PerGpuUtilizations utilizations) {
+    this.gpuUtilizations = utilizations;
+  }
+
+  @XmlElement(name = "bar1_memory_usage")
+  public PerGpuMemoryUsage getGpuMemoryUsage() {
+    return gpuMemoryUsage;
+  }
+
+  public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
+    this.gpuMemoryUsage = gpuMemoryUsage;
+  }
+
+
+  /**
+   * One-line summary: product name and minor number, plus total memory
+   * and overall utilization when those sub-objects are set.
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("ProductName=").append(productName).append(", MinorNumber=")
+        .append(minorNumber);
+
+    if (getGpuMemoryUsage() != null) {
+      sb.append(", TotalMemory=").append(
+          getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
+    }
+
+    if (getGpuUtilizations() != null) {
+      sb.append(", Utilization=").append(
+          getGpuUtilizations().getOverallGpuUtilization()).append("%");
+    }
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 0000000..3964c4e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Used and free memory of one GPU, in MiB, bound to the
+ * {@code bar1_memory_usage} XML element.
+ * NOTE(review): confirm that BAR1 memory, rather than the frame buffer
+ * ({@code fb_memory_usage}), is the region intended here.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "bar1_memory_usage")
+public class PerGpuMemoryUsage {
+  // Both start at -1 and keep that value until the setter is called.
+  long usedMemoryMiB = -1L;
+  long availMemoryMiB = -1L;
+
+  /**
+   * @return value of the {@code used} element in MiB
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+  @XmlElement(name = "used")
+  public Long getUsedMemoryMiB() {
+    return Long.valueOf(this.usedMemoryMiB);
+  }
+
+  public void setUsedMemoryMiB(Long usedMemoryMiB) {
+    this.usedMemoryMiB = usedMemoryMiB.longValue();
+  }
+
+  /**
+   * @return value of the {@code free} element in MiB
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+  @XmlElement(name = "free")
+  public Long getAvailMemoryMiB() {
+    return Long.valueOf(this.availMemoryMiB);
+  }
+
+  public void setAvailMemoryMiB(Long availMemoryMiB) {
+    this.availMemoryMiB = availMemoryMiB.longValue();
+  }
+
+  /**
+   * @return sum of the used and available values; no special handling is
+   *         applied when either is still at its -1 default
+   */
+  public long getTotalMemoryMiB() {
+    long total = this.usedMemoryMiB + this.availMemoryMiB;
+    return total;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 0000000..ccd60cb
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Temperature readings of one GPU, bound to the {@code temperature} XML
+ * element. Each field starts at {@link Float#MIN_VALUE}; note that this
+ * constant is the smallest positive float, not the most negative one.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "temperature")
+public class PerGpuTemperature {
+  private float currentGpuTemp = Float.MIN_VALUE;
+  private float maxGpuTemp = Float.MIN_VALUE;
+  private float slowThresholdGpuTemp = Float.MIN_VALUE;
+
+  /**
+   * Current GPU temperature in degrees Celsius.
+   * @return value of the {@code gpu_temp} element
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+  @XmlElement(name = "gpu_temp")
+  public Float getCurrentGpuTemp() {
+    return Float.valueOf(this.currentGpuTemp);
+  }
+
+  public void setCurrentGpuTemp(Float currentGpuTemp) {
+    this.currentGpuTemp = currentGpuTemp.floatValue();
+  }
+
+  /**
+   * Maximum possible GPU temperature in degrees Celsius.
+   * @return value of the {@code gpu_temp_max_threshold} element
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+  @XmlElement(name = "gpu_temp_max_threshold")
+  public Float getMaxGpuTemp() {
+    return Float.valueOf(this.maxGpuTemp);
+  }
+
+  public void setMaxGpuTemp(Float maxGpuTemp) {
+    this.maxGpuTemp = maxGpuTemp.floatValue();
+  }
+
+  /**
+   * Temperature in degrees Celsius at which the GPU may run slower.
+   * @return value of the {@code gpu_temp_slow_threshold} element
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+  @XmlElement(name = "gpu_temp_slow_threshold")
+  public Float getSlowThresholdGpuTemp() {
+    return Float.valueOf(this.slowThresholdGpuTemp);
+  }
+
+  public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
+    this.slowThresholdGpuTemp = slowThresholdGpuTemp.floatValue();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
new file mode 100644
index 0000000..4ef218b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Utilization figures of one GPU, bound to the {@code utilization} XML
+ * element.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "utilization")
+public class PerGpuUtilizations {
+  // No initializer, so this is 0 until the setter is called.
+  private float overallGpuUtilization;
+
+  /**
+   * Overall GPU utilization, as a percentage.
+   * @return value of the {@code gpu_util} element
+   */
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+  @XmlElement(name = "gpu_util")
+  public Float getOverallGpuUtilization() {
+    return Float.valueOf(this.overallGpuUtilization);
+  }
+
+  public void setOverallGpuUtilization(Float overallGpuUtilization) {
+    this.overallGpuUtilization = overallGpuUtilization.floatValue();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
new file mode 100644
index 0000000..13b3ee9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.net.ServerSocketUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.api.ResourceTracker;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
+import org.junit.Assert;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+/**
+ * Shared fixture for NodeManager tests. It provides the working
+ * directories under {@code target/}, a NodeManager configuration bound to
+ * localhost, and stub implementations of the resource tracker, node status
+ * updater and container manager.
+ */
+public class NodeManagerTestBase {
+  // temp fix until metrics system can auto-detect itself running in unit test:
+  static {
+    DefaultMetricsSystem.setMiniClusterMode(true);
+  }
+
+  // Named after this class so output from every subclass is not
+  // attributed to TestNodeStatusUpdater.
+  protected static final Logger LOG =
+      LoggerFactory.getLogger(NodeManagerTestBase.class);
+  // The directory name is left as-is so existing test paths do not change.
+  protected static final File basedir =
+      new File("target", TestNodeStatusUpdater.class.getName());
+  protected static final File nmLocalDir = new File(basedir, "nm0");
+  protected static final File tmpDir = new File(basedir, "tmpDir");
+  protected static final File remoteLogsDir = new File(basedir, "remotelogs");
+  protected static final File logsDir = new File(basedir, "logs");
+  protected static final RecordFactory recordFactory = RecordFactoryProvider
+      .getRecordFactory(null);
+  // Assigned by setUp() before each test.
+  protected Configuration conf;
+
+  /**
+   * Builds a NodeManager configuration using a port obtained from
+   * {@code ServerSocketUtil.getPort(49170, 10)}.
+   *
+   * @return the new configuration
+   * @throws IOException propagated from the port lookup
+   */
+  protected YarnConfiguration createNMConfig() throws IOException {
+    return createNMConfig(ServerSocketUtil.getPort(49170, 10));
+  }
+
+  /**
+   * Builds a NodeManager configuration whose address is the canonical
+   * localhost name plus the given port, and whose log and local
+   * directories point at this fixture's directories.
+   *
+   * @param port port used for the NodeManager address
+   * @return the new configuration
+   * @throws IOException propagated from the localizer port lookup
+   */
+  protected YarnConfiguration createNMConfig(int port) throws IOException {
+    // Local name differs from the "conf" field to avoid shadowing it.
+    YarnConfiguration nmConf = new YarnConfiguration();
+    String localhostAddress = null;
+    try {
+      localhostAddress = InetAddress.getByName("localhost")
+          .getCanonicalHostName();
+    } catch (UnknownHostException e) {
+      Assert.fail("Unable to get localhost address: " + e.getMessage());
+    }
+    nmConf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
+    nmConf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
+    nmConf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+        + ServerSocketUtil.getPort(49160, 10));
+    nmConf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
+    nmConf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
+        remoteLogsDir.getAbsolutePath());
+    nmConf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+    nmConf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
+    return nmConf;
+  }
+
+  /**
+   * Resource tracker stub: every call returns a freshly constructed,
+   * otherwise untouched response record.
+   */
+  public static class BaseResourceTrackerForTest implements ResourceTracker {
+    @Override
+    public RegisterNodeManagerResponse registerNodeManager(
+        RegisterNodeManagerRequest request) throws YarnException, IOException {
+      return new RegisterNodeManagerResponsePBImpl();
+    }
+
+    @Override
+    public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
+        throws YarnException, IOException {
+      return new NodeHeartbeatResponsePBImpl();
+    }
+
+    @Override
+    public UnRegisterNodeManagerResponse unRegisterNodeManager(
+        UnRegisterNodeManagerRequest request)
+        throws YarnException, IOException {
+      return new UnRegisterNodeManagerResponsePBImpl();
+    }
+  }
+
+  /**
+   * Node status updater that hands out the supplied
+   * {@link ResourceTracker} as its RM client and does nothing when asked
+   * to stop the RM proxy.
+   */
+  protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
+    public ResourceTracker resourceTracker;
+    protected Context context;
+
+    public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
+        NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+        ResourceTracker resourceTracker) {
+      super(context, dispatcher, healthChecker, metrics);
+      this.context = context;
+      this.resourceTracker = resourceTracker;
+    }
+
+    @Override
+    protected ResourceTracker getRMClient() {
+      return resourceTracker;
+    }
+
+    @Override
+    protected void stopRMProxy() {
+      // Nothing to stop: the client is the injected tracker.
+    }
+  }
+
+  /**
+   * Container manager whose {@code handle} only records whether a
+   * SIGNAL_CONTAINERS event has been seen.
+   */
+  public class MyContainerManager extends ContainerManagerImpl {
+    public boolean signaled = false;
+
+    public MyContainerManager(Context context, ContainerExecutor exec,
+        DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
+        NodeManagerMetrics metrics,
+        LocalDirsHandlerService dirsHandler) {
+      super(context, exec, deletionContext, nodeStatusUpdater,
+          metrics, dirsHandler);
+    }
+
+    @Override
+    public void handle(ContainerManagerEvent event) {
+      if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
+        signaled = true;
+      }
+    }
+  }
+
+  /**
+   * Creates the working directories and a fresh configuration before each
+   * test.
+   *
+   * @throws IOException propagated from {@link #createNMConfig()}
+   */
+  @Before
+  public void setUp() throws IOException {
+    nmLocalDir.mkdirs();
+    tmpDir.mkdirs();
+    logsDir.mkdirs();
+    remoteLogsDir.mkdirs();
+    conf = createNMConfig();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
index 2e9eff5..9b180c7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
@@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor {
     FileContext lfs = FileContext.getLocalFSFileContext(conf);
     DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
     executor.setConf(conf);
-    executor.init();
+    executor.init(null);
 
     try {
       executor.createUserLocalDirs(localDirs, user);
@@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor {
       Path workDir = localDir;
       Path pidFile = new Path(workDir, "pid.txt");
 
-      mockExec.init();
+      mockExec.init(null);
       mockExec.activateContainer(cId, pidFile);
       int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
           .setContainer(container)

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index d4db6b0..dcec4c3 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor {
     LinuxContainerExecutor lce = new LinuxContainerExecutor();
     lce.setConf(conf);
     try {
-      lce.init();
+      lce.init(null);
     } catch (IOException e) {
       // expected if LCE isn't setup right, but not necessary for this test
     }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
index ad675b1..3dfa625 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
@@ -415,7 +415,7 @@ public class TestLinuxContainerExecutorWithMocks {
   @Test
   public void testInit() throws Exception {
 
-    mockExec.init();
+    mockExec.init(mock(Context.class));
     assertEquals(Arrays.asList("--checksetup"), readMockParams());
     
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
index 9279711..b31215b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
@@ -37,7 +37,7 @@ public class TestNodeManager {
   public static final class InvalidContainerExecutor extends
       DefaultContainerExecutor {
     @Override
-    public void init() throws IOException {
+    public void init(Context nmContext) throws IOException {
       throw new IOException("dummy executor init called");
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
index 11c3c35..8435340 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
@@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager;
 
 import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
-import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -81,8 +79,6 @@ import org.apache.hadoop.yarn.event.Event;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.factories.RecordFactory;
-import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -118,41 +114,14 @@ import org.junit.Before;
 import org.junit.Test;
 
 @SuppressWarnings("rawtypes")
-public class TestNodeStatusUpdater {
-
-  // temp fix until metrics system can auto-detect itself running in unit test:
-  static {
-    DefaultMetricsSystem.setMiniClusterMode(true);
-  }
-
-  static final Logger LOG =
-       LoggerFactory.getLogger(TestNodeStatusUpdater.class);
-  static final File basedir =
-      new File("target", TestNodeStatusUpdater.class.getName());
-  static final File nmLocalDir = new File(basedir, "nm0");
-  static final File tmpDir = new File(basedir, "tmpDir");
-  static final File remoteLogsDir = new File(basedir, "remotelogs");
-  static final File logsDir = new File(basedir, "logs");
-  private static final RecordFactory recordFactory = RecordFactoryProvider
-      .getRecordFactory(null);
-
+public class TestNodeStatusUpdater extends NodeManagerTestBase {
   volatile int heartBeatID = 0;
   volatile Throwable nmStartError = null;
   private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
   private boolean triggered = false;
-  private Configuration conf;
   private NodeManager nm;
   private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
 
-  @Before
-  public void setUp() throws IOException {
-    nmLocalDir.mkdirs();
-    tmpDir.mkdirs();
-    logsDir.mkdirs();
-    remoteLogsDir.mkdirs();
-    conf = createNMConfig();
-  }
-
   @After
   public void tearDown() {
     this.registeredNodes.clear();
@@ -334,29 +303,7 @@ public class TestNodeStatusUpdater {
     }
   }
 
-  private class MyContainerManager extends ContainerManagerImpl {
-    public boolean signaled = false;
-
-    public MyContainerManager(Context context, ContainerExecutor exec,
-        DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
-        NodeManagerMetrics metrics,
-        LocalDirsHandlerService dirsHandler) {
-      super(context, exec, deletionContext, nodeStatusUpdater,
-          metrics, dirsHandler);
-    }
-
-    @Override
-    public void handle(ContainerManagerEvent event) {
-      if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
-        signaled = true;
-      }
-    }
-  }
-
-  private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
-    public ResourceTracker resourceTracker;
-    private Context context;
-
+  private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
     public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
         NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
       this(context, dispatcher, healthChecker, metrics, false);
@@ -365,19 +312,8 @@ public class TestNodeStatusUpdater {
     public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
         NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
         boolean signalContainer) {
-      super(context, dispatcher, healthChecker, metrics);
-      this.context = context;
-      resourceTracker = new MyResourceTracker(this.context, signalContainer);
-    }
-
-    @Override
-    protected ResourceTracker getRMClient() {
-      return resourceTracker;
-    }
-
-    @Override
-    protected void stopRMProxy() {
-      return;
+      super(context, dispatcher, healthChecker, metrics,
+          new MyResourceTracker(context, signalContainer));
     }
   }
 
@@ -1820,7 +1756,6 @@ public class TestNodeStatusUpdater {
     Assert.assertTrue("Test failed with exception(s)" + exceptions,
         exceptions.isEmpty());
   }
-
   // Add new containers info into NM context each time node heart beats.
   private class MyNMContext extends NMContext {
 
@@ -1924,31 +1859,6 @@ public class TestNodeStatusUpdater {
         this.registeredNodes.size());
   }
 
-  private YarnConfiguration createNMConfig(int port) throws IOException {
-    YarnConfiguration conf = new YarnConfiguration();
-    String localhostAddress = null;
-    try {
-      localhostAddress = InetAddress.getByName("localhost")
-          .getCanonicalHostName();
-    } catch (UnknownHostException e) {
-      Assert.fail("Unable to get localhost address: " + e.getMessage());
-    }
-    conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
-    conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
-    conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
-        + ServerSocketUtil.getPort(49160, 10));
-    conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
-    conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
-      remoteLogsDir.getAbsolutePath());
-    conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
-    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
-    return conf;
-  }
-
-  private YarnConfiguration createNMConfig() throws IOException {
-    return createNMConfig(ServerSocketUtil.getPort(49170, 10));
-  }
-
   private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
     return new NodeManager() {
       @Override

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
index 0838f1e..3c57496 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
@@ -18,26 +18,6 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
 
-import java.io.IOException;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -64,6 +44,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
 import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
 import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
 import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@@ -72,17 +53,36 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
 import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
 import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
 import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.util.Records;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
 
 /**
  * Base class for all the AMRMProxyService test cases. It provides utility
@@ -773,5 +773,9 @@ public abstract class BaseAMRMProxyTest {
         getContainerStateTransitionListener() {
       return null;
     }
+
+    public ResourcePluginManager getResourcePluginManager() {
+      return null;
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
index e5414a5..0563694 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
@@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
@@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory;
 
 import java.util.List;
 
+import static org.mockito.Mockito.mock;
+
 public class TestResourceHandlerModule {
   private static final Logger LOG =
        LoggerFactory.getLogger(TestResourceHandlerModule.class);
@@ -62,7 +65,7 @@ public class TestResourceHandlerModule {
 
       //Ensure that outbound bandwidth resource handler is present in the chain
       ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
-          .getConfiguredResourceHandlerChain(networkEnabledConf);
+          .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
       List<ResourceHandler> resourceHandlers = resourceHandlerChain
           .getResourceHandlerList();
       //Exactly one resource handler in chain
@@ -88,7 +91,8 @@ public class TestResourceHandlerModule {
     Assert.assertNotNull(handler);
 
     ResourceHandlerChain resourceHandlerChain =
-        ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
+        ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
+            mock(Context.class));
     List<ResourceHandler> resourceHandlers =
         resourceHandlerChain.getResourceHandlerList();
     // Exactly one resource handler in chain

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
new file mode 100644
index 0000000..5c70f7a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -0,0 +1,382 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyList;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestGpuResourceHandler {
+  private CGroupsHandler mockCGroupsHandler;
+  private PrivilegedOperationExecutor mockPrivilegedExecutor;
+  private GpuResourceHandlerImpl gpuResourceHandler;
+  private NMStateStoreService mockNMStateStore;
+  private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
+
+  @Before
+  public void setup() {
+    TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+
+    mockCGroupsHandler = mock(CGroupsHandler.class);
+    mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
+    mockNMStateStore = mock(NMStateStoreService.class);
+
+    Context nmctx = mock(Context.class);
+    when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
+    runningContainersMap = new ConcurrentHashMap<>();
+    when(nmctx.getContainers()).thenReturn(runningContainersMap);
+
+    gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
+        mockPrivilegedExecutor);
+  }
+
+  @Test
+  public void testBootStrap() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    verify(mockCGroupsHandler, times(1)).initializeCGroupController(
+        CGroupsHandler.CGroupController.DEVICES);
+  }
+
+  private static ContainerId getContainerId(int id) {
+    return ContainerId.newContainerId(ApplicationAttemptId
+        .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
+  }
+
+  private static Container mockContainerWithGpuRequest(int id,
+      int numGpuRequest) {
+    Container c = mock(Container.class);
+    when(c.getContainerId()).thenReturn(getContainerId(id));
+
+    Resource res = Resource.newInstance(1024, 1);
+    ResourceMappings resMapping = new ResourceMappings();
+
+    res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
+    when(c.getResource()).thenReturn(res);
+    when(c.getResourceMappings()).thenReturn(resMapping);
+    return c;
+  }
+
+  private void verifyDeniedDevices(ContainerId containerId,
+      List<Integer> deniedDevices)
+      throws ResourceHandlerException, PrivilegedOperationException {
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, containerId.toString());
+
+    if (null != deniedDevices && !deniedDevices.isEmpty()) {
+      verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
+          new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+              .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+                  containerId.toString(),
+                  GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+                  StringUtils.join(",", deniedDevices))), true);
+    }
+  }
+
+  @Test
+  public void testAllocation() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which requests 3 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+    // Only device=4 will be blocked.
+    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+    /* Start container 2, which requests 2 GPUs. Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Start container 3, which requests 1 GPU. Expected to succeed */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+
+    // devices = 0/1/3 will be blocked
+    verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+
+    /* Start container 4, which requests 0 GPUs. Expected to succeed */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
+
+    // All devices will be blocked
+    verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+
+    /* Release container-1, expect its cgroup to be deleted exactly once */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).deleteCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(3,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Release container-3, expect its cgroup to be deleted exactly once */
+    gpuResourceHandler.postComplete(getContainerId(3));
+
+    verify(mockCGroupsHandler, times(1)).deleteCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @SuppressWarnings("unchecked")
+  @Test
+  public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
+      throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    doThrow(new IOException("Exception ...")).when(mockNMStateStore)
+        .storeAssignedResources(
+        any(ContainerId.class), anyString(), anyList());
+
+    boolean exception = false;
+    /* Start container 1, which requests 3 GPUs; the state store will throw */
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+    } catch (ResourceHandlerException e) {
+      exception = true;
+    }
+
+    Assert.assertTrue("preStart should throw exception", exception);
+
+    // After preStart, all 4 GPUs are still available since the store op failed.
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationWithoutAllowedGpus() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(0,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which requests 0 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+    verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+    /* Start container 2, which requests 1 GPU. Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Release container 1, expect its cgroup to be deleted exactly once */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).deleteCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(0,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationStored() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, which requests 3 GPUs */
+    Container container = mockContainerWithGpuRequest(1, 3);
+    gpuResourceHandler.preStart(container);
+
+    verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
+        ResourceInformation.GPU_URI,
+        Arrays.asList("0", "1", "3"));
+
+    Assert.assertEquals(3, container.getResourceMappings()
+        .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+    // Only device=4 will be blocked.
+    verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+    /* Start container 2, which requests 0 GPUs. Expected to succeed */
+    container = mockContainerWithGpuRequest(2, 0);
+    gpuResourceHandler.preStart(container);
+
+    verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+    Assert.assertEquals(0, container.getResourceMappings()
+        .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+    // storeAssignedResources must not be invoked for container 2.
+    verify(mockNMStateStore, never()).storeAssignedResources(
+        eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
+  }
+
+  /**
+   * Verifies that reacquiring a container restores its GPU assignment in the
+   * allocator's device mapping, and that a reacquire which throws
+   * {@link ResourceHandlerException} leaves that mapping unchanged.
+   */
+  @Test
+  public void testRecoverResourceAllocation() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    // Container 1 is registered as running with devices 1 and 3 assigned.
+    Container nmContainer = mock(Container.class);
+    ResourceMappings rmap = new ResourceMappings();
+    ResourceMappings.AssignedResources ar =
+        new ResourceMappings.AssignedResources();
+    ar.updateAssignedResources(Arrays.asList("1", "3"));
+    rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+    when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+    runningContainersMap.put(getContainerId(1), nmContainer);
+
+    // TEST CASE
+    // Reacquire container restore state of GPU Resource Allocator.
+    gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+    Map<Integer, ContainerId> deviceAllocationMapping =
+        gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+    Assert.assertEquals(2, deviceAllocationMapping.size());
+    Assert.assertTrue(
+        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+    Assert.assertEquals(getContainerId(1), deviceAllocationMapping.get(1));
+
+    // TEST CASE
+    // Try to reacquire a container but requested device is not in allowed list.
+    nmContainer = mock(Container.class);
+    rmap = new ResourceMappings();
+    ar = new ResourceMappings.AssignedResources();
+    // id=5 is not in allowed list.
+    ar.updateAssignedResources(Arrays.asList("4", "5"));
+    rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+    when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+    runningContainersMap.put(getContainerId(2), nmContainer);
+
+    // NOTE(review): the mock above is registered under container 2, yet
+    // container 1 is the one reacquired below. The exception is then presumably
+    // raised because container 1's devices were already recovered, not because
+    // device 5 is disallowed -- confirm, and consider reacquiring
+    // getContainerId(2) once the allocator's rollback behaviour is known.
+    boolean caughtException = false;
+    try {
+      gpuResourceHandler.reacquireContainer(getContainerId(1));
+    } catch (ResourceHandlerException e) {
+      caughtException = true;
+    }
+    Assert.assertTrue(
+        "Should fail since requested device Id is not in allowed list",
+        caughtException);
+
+    // Make sure internal state not changed.
+    deviceAllocationMapping =
+        gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+    Assert.assertEquals(2, deviceAllocationMapping.size());
+    Assert.assertTrue(
+        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+    Assert.assertEquals(getContainerId(1), deviceAllocationMapping.get(1));
+
+    // TEST CASE
+    // Try to reacquire a container but requested device is already assigned.
+    nmContainer = mock(Container.class);
+    rmap = new ResourceMappings();
+    ar = new ResourceMappings.AssignedResources();
+    // id=3 is already assigned
+    ar.updateAssignedResources(Arrays.asList("4", "3"));
+    // NOTE(review): literal "gpu" here, ResourceInformation.GPU_URI in the
+    // cases above -- confirm both name the same resource key.
+    rmap.addAssignedResources("gpu", ar);
+    when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+    runningContainersMap.put(getContainerId(2), nmContainer);
+
+    // NOTE(review): same mismatch as in the previous case -- container 1 is
+    // reacquired although the mock was registered for container 2.
+    caughtException = false;
+    try {
+      gpuResourceHandler.reacquireContainer(getContainerId(1));
+    } catch (ResourceHandlerException e) {
+      caughtException = true;
+    }
+    Assert.assertTrue(
+        "Should fail since requested device Id is already assigned",
+        caughtException);
+
+    // Make sure internal state not changed.
+    deviceAllocationMapping =
+        gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+    Assert.assertEquals(2, deviceAllocationMapping.size());
+    Assert.assertTrue(
+        deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+    Assert.assertEquals(getContainerId(1), deviceAllocationMapping.get(1));
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index e21eea0..2cca277 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -73,7 +73,7 @@ public class TestContainersMonitorResourceChange {
 
   private static class MockExecutor extends ContainerExecutor {
     @Override
-    public void init() throws IOException {
+    public void init(Context nmContext) throws IOException {
     }
     @Override
     public void startLocalizer(LocalizerStartContext ctx)

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
new file mode 100644
index 0000000..bcadf76
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.ServiceOperations;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Tests how a {@link ResourcePluginManager} is used by the NodeManager:
+ * that it is initialized during NM init, that a plugin's
+ * {@link NodeResourceUpdaterPlugin} is invoked after NM start, and that a
+ * plugin-provided {@link ResourceHandler} ends up in the
+ * {@link LinuxContainerExecutor}'s {@link ResourceHandlerChain}.
+ */
+public class TestResourcePluginManager extends NodeManagerTestBase {
+  // NodeManager under test; stopped in tearDown() when a test created one.
+  private NodeManager nm;
+
+  /**
+   * Builds a mocked {@link ResourcePluginManager} whose
+   * {@code getNameToPlugins()} returns two mocked plugins:
+   * "resource1", which hands out a mocked {@link NodeResourceUpdaterPlugin},
+   * and "resource2", which creates a {@link CustomizedResourceHandler}.
+   *
+   * @return the stubbed plugin manager
+   */
+  ResourcePluginManager stubResourcePluginmanager() {
+    // Stub ResourcePluginManager
+    final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+    Map<String, ResourcePlugin> plugins = new HashMap<>();
+
+    // First resource plugin
+    ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
+    NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
+        NodeResourceUpdaterPlugin.class);
+    when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
+        nodeResourceUpdaterPlugin);
+    plugins.put("resource1", resourcePlugin);
+
+    // Second resource plugin
+    resourcePlugin = mock(ResourcePlugin.class);
+    when(resourcePlugin.createResourceHandler(any(Context.class), any(
+        CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
+        .thenReturn(new CustomizedResourceHandler());
+    plugins.put("resource2", resourcePlugin);
+    when(rpm.getNameToPlugins()).thenReturn(plugins);
+    return rpm;
+  }
+
+  /**
+   * Stops the NodeManager created by the test, if any. Anything thrown while
+   * stopping is deliberately ignored so cleanup never fails a test.
+   */
+  @After
+  public void tearDown() {
+    if (nm != null) {
+      try {
+        ServiceOperations.stop(nm);
+      } catch (Throwable t) {
+        // ignore
+      }
+    }
+  }
+
+  /**
+   * Do-nothing {@link ResourceHandler}: every method returns {@code null}.
+   * It only serves as a recognizable type that
+   * testLinuxContainerExecutorWithResourcePluginsEnabled looks for in the
+   * handler chain.
+   */
+  private class CustomizedResourceHandler implements ResourceHandler {
+
+    @Override
+    public List<PrivilegedOperation> bootstrap(Configuration configuration)
+        throws ResourceHandlerException {
+      return null;
+    }
+
+    @Override
+    public List<PrivilegedOperation> preStart(Container container)
+        throws ResourceHandlerException {
+      return null;
+    }
+
+    @Override
+    public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
+        throws ResourceHandlerException {
+      return null;
+    }
+
+    @Override
+    public List<PrivilegedOperation> postComplete(ContainerId containerId)
+        throws ResourceHandlerException {
+      return null;
+    }
+
+    @Override
+    public List<PrivilegedOperation> teardown()
+        throws ResourceHandlerException {
+      return null;
+    }
+  }
+
+  /**
+   * {@link NodeManager} that uses the {@link ResourcePluginManager} passed to
+   * its constructor instead of creating its own, and that substitutes
+   * BaseNodeStatusUpdaterForTest and MyContainerManager for the status updater
+   * and container manager (both are declared outside this class, presumably in
+   * NodeManagerTestBase).
+   */
+  private class MyMockNM extends NodeManager {
+    private final ResourcePluginManager rpm;
+
+    public MyMockNM(ResourcePluginManager rpm) {
+      this.rpm = rpm;
+    }
+
+    @Override
+    protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+        Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+      // Put the supplied plugin manager on the NM context before the status
+      // updater is constructed with that context.
+      ((NodeManager.NMContext)context).setResourcePluginManager(rpm);
+      return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+          metrics, new BaseResourceTrackerForTest());
+    }
+
+    @Override
+    protected ContainerManagerImpl createContainerManager(Context context,
+        ContainerExecutor exec, DeletionService del,
+        NodeStatusUpdater nodeStatusUpdater,
+        ApplicationACLsManager aclsManager,
+        LocalDirsHandlerService diskhandler) {
+      return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+      metrics, diskhandler);
+    }
+
+    @Override
+    protected ResourcePluginManager createResourcePluginManager() {
+      return rpm;
+    }
+  }
+
+  /**
+   * {@link LinuxContainerExecutor} whose privileged-operation executor is a
+   * Mockito mock.
+   */
+  public class MyLCE extends LinuxContainerExecutor {
+    private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
+
+    @Override
+    protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
+      return poe;
+    }
+  }
+
+  /*
+   * Make sure ResourcePluginManager is initialized during NM start up.
+   * Only nm.init() is called; the check is that initialize(Context) was
+   * invoked exactly once on the stubbed plugin manager.
+   */
+  @Test(timeout = 30000)
+  public void testResourcePluginManagerInitialization() throws Exception {
+    final ResourcePluginManager rpm = stubResourcePluginmanager();
+    nm = new MyMockNM(rpm);
+
+    YarnConfiguration conf = createNMConfig();
+    nm.init(conf);
+    verify(rpm, times(1)).initialize(
+        any(Context.class));
+  }
+
+  /*
+   * Make sure ResourcePluginManager is invoked during NM update.
+   * After init + start, the updater plugin of "resource1" must have received
+   * exactly one updateConfiguredResource(Resource) call.
+   */
+  @Test(timeout = 30000)
+  public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
+    final ResourcePluginManager rpm = stubResourcePluginmanager();
+
+    nm = new MyMockNM(rpm);
+
+    YarnConfiguration conf = createNMConfig();
+    nm.init(conf);
+    nm.start();
+
+    // The stub returns the same mocked updater plugin each time, so this is
+    // the instance the NM interacted with.
+    NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
+        rpm.getNameToPlugins().get("resource1")
+            .getNodeResourceHandlerInstance();
+
+    verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
+        any(Resource.class));
+  }
+
+  /*
+   * Make sure ResourcePluginManager is used to initialize ResourceHandlerChain
+   * The NM is built around a MyLCE executor; after init + start the executor's
+   * resource handler must be a ResourceHandlerChain containing the
+   * CustomizedResourceHandler created by the "resource2" plugin.
+   */
+  @Test(timeout = 30000)
+  public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
+    final ResourcePluginManager rpm = stubResourcePluginmanager();
+    final LinuxContainerExecutor lce = new MyLCE();
+
+    nm = new NodeManager() {
+      @Override
+      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+        ((NMContext)context).setResourcePluginManager(rpm);
+        return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+            metrics, new BaseResourceTrackerForTest());
+      }
+
+      @Override
+      protected ContainerManagerImpl createContainerManager(Context context,
+          ContainerExecutor exec, DeletionService del,
+          NodeStatusUpdater nodeStatusUpdater,
+          ApplicationACLsManager aclsManager,
+          LocalDirsHandlerService diskhandler) {
+        return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+            metrics, diskhandler);
+      }
+
+      @Override
+      protected ContainerExecutor createContainerExecutor(Configuration conf) {
+        // The plugin manager is also set here, presumably so it is already on
+        // the context when the executor is initialized -- TODO confirm the
+        // NM's call order.
+        ((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
+        lce.setConf(conf);
+        return lce;
+      }
+    };
+
+    YarnConfiguration conf = createNMConfig();
+
+    nm.init(conf);
+    nm.start();
+
+    ResourceHandler handler = lce.getResourceHandler();
+    Assert.assertNotNull(handler);
+    Assert.assertTrue(handler instanceof ResourceHandlerChain);
+
+    // Scan the chain for the handler type that only the stubbed plugin creates.
+    boolean newHandlerAdded = false;
+    for (ResourceHandler h : ((ResourceHandlerChain) handler)
+        .getResourceHandlerList()) {
+      if (h instanceof CustomizedResourceHandler) {
+        newHandlerAdded = true;
+        break;
+      }
+    }
+    Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/fa5cfc68/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
new file mode 100644
index 0000000..83bace2
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+public class TestGpuDiscoverer {
+  private String getTestParentFolder() {
+    File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
+    return f.getAbsolutePath();
+  }
+
+  private void touchFile(File f) throws IOException {
+    new FileOutputStream(f).close();
+  }
+
+  @Before
+  public void before() throws IOException {
+    String folder = getTestParentFolder();
+    File f = new File(folder);
+    FileUtils.deleteDirectory(f);
+    f.mkdirs();
+  }
+
+  @Test
+  public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
+    // Only run this on demand.
+    Assume.assumeTrue(Boolean.valueOf(
+        System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
+
+    // test case 1, check default setting.
+    Configuration conf = new Configuration(false);
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+        plugin.getPathOfGpuBinary());
+    Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+    Assert.assertTrue(
+        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+
+    // test case 2, check mandatory set path.
+    File fakeBinary = new File(getTestParentFolder(),
+        GpuDiscoverer.DEFAULT_BINARY_NAME);
+    touchFile(fakeBinary);
+    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(fakeBinary.getAbsolutePath(),
+        plugin.getPathOfGpuBinary());
+    Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+
+    // test case 3, check mandatory set path, but binary doesn't exist so default
+    // path will be used.
+    fakeBinary.delete();
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+        plugin.getPathOfGpuBinary());
+    Assert.assertTrue(
+        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+  }
+
+  @Test
+  public void testGpuDiscover() throws YarnException {
+    // Since this is more of a performance unit test, only run if
+    // RunUserLimitThroughput is set (-DRunUserLimitThroughput=true)
+    Assume.assumeTrue(
+        Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
+    Configuration conf = new Configuration(false);
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    GpuDeviceInformation info = plugin.getGpuDeviceInformation();
+
+    Assert.assertTrue(info.getGpus().size() > 0);
+    Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
+        info.getGpus().size());
+  }
+
+  @Test
+  public void getNumberOfUsableGpusFromConfig() throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+
+    List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
+    Assert.assertEquals(4, minorNumbers.size());
+
+    Assert.assertTrue(0 == minorNumbers.get(0));
+    Assert.assertTrue(1 == minorNumbers.get(1));
+    Assert.assertTrue(2 == minorNumbers.get(2));
+    Assert.assertTrue(4 == minorNumbers.get(3));
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message