From: shv@apache.org
To: common-commits@hadoop.apache.org
Date: Wed, 02 May 2018 20:41:01 -0000
Subject: [30/51] [abbrv] hadoop git commit: YARN-7137. [YARN-3926] Move newly added APIs to unstable in YARN-3926 branch. Contributed by Wangda Tan.

YARN-7137. [YARN-3926] Move newly added APIs to unstable in YARN-3926 branch. Contributed by Wangda Tan.

(cherry picked from commit da0b6a354bf6f6bf37ca5a05a4a8eece09aa4893)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/15319c79
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/15319c79
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/15319c79

Branch: refs/heads/YARN-8200
Commit: 15319c79a6e67073246b6efff7527b6879a40fc7
Parents: 0406adc
Author: Sunil G
Authored: Tue Sep 12 20:31:47 2017 +0530
Committer: Konstantin V Shvachko
Committed: Wed May 2 13:19:58 2018 -0700

----------------------------------------------------------------------
 .../hadoop/yarn/api/records/Resource.java | 24 +-
 .../yarn/api/records/ResourceInformation.java | 12 +-
 .../yarn/api/records/ResourceRequest.java | 1 +
 .../hadoop/yarn/conf/YarnConfiguration.java | 33 ++
 .../yarn/util/resource/ResourceUtils.java | 70 +-
 .../hadoop/yarn/util/resource/package-info.java | 6 +-
 .../src/main/resources/yarn-default.xml | 48 +-
 .../yarn/util/resource/TestResourceUtils.java | 17 +
 .../server/nodemanager/ContainerExecutor.java | 3 +-
 .../hadoop/yarn/server/nodemanager/Context.java | 3 +
 .../nodemanager/DefaultContainerExecutor.java | 2 +-
 .../nodemanager/LinuxContainerExecutor.java | 10 +-
 .../yarn/server/nodemanager/NodeManager.java | 92 ++--
 .../nodemanager/NodeStatusUpdaterImpl.java | 38 +-
 .../linux/privileged/PrivilegedOperation.java | 1 +
 .../linux/resources/ResourceHandlerChain.java | 4 +-
 .../linux/resources/ResourceHandlerModule.java | 42 +-
 .../resources/gpu/GpuResourceAllocator.java | 242 ++++++++
 .../resources/gpu/GpuResourceHandlerImpl.java | 153 ++++++
 .../NodeResourceUpdaterPlugin.java | 52 ++
 .../resourceplugin/ResourcePlugin.java | 83 +++
 .../resourceplugin/ResourcePluginManager.java | 106 ++++
 .../resourceplugin/gpu/GpuDiscoverer.java | 254 +++++++++
 .../gpu/GpuNodeResourceUpdateHandler.java | 66 +++
 .../resourceplugin/gpu/GpuResourcePlugin.java | 61 +++
 .../webapp/dao/gpu/GpuDeviceInformation.java | 72
+++ .../dao/gpu/GpuDeviceInformationParser.java | 87 +++ .../webapp/dao/gpu/PerGpuDeviceInformation.java | 165 ++++++ .../webapp/dao/gpu/PerGpuMemoryUsage.java | 58 ++ .../webapp/dao/gpu/PerGpuTemperature.java | 80 +++ .../webapp/dao/gpu/PerGpuUtilizations.java | 50 ++ .../server/nodemanager/NodeManagerTestBase.java | 164 ++++++ .../TestDefaultContainerExecutor.java | 4 +- .../nodemanager/TestLinuxContainerExecutor.java | 2 +- .../TestLinuxContainerExecutorWithMocks.java | 2 +- .../server/nodemanager/TestNodeManager.java | 2 +- .../nodemanager/TestNodeStatusUpdater.java | 100 +--- .../amrmproxy/BaseAMRMProxyTest.java | 46 +- .../resources/TestResourceHandlerModule.java | 8 +- .../resources/gpu/TestGpuResourceHandler.java | 382 +++++++++++++ .../TestContainersMonitorResourceChange.java | 2 +- .../TestResourcePluginManager.java | 261 +++++++++ .../resourceplugin/gpu/TestGpuDiscoverer.java | 123 +++++ .../dao/gpu/TestGpuDeviceInformationParser.java | 50 ++ .../test/resources/nvidia-smi-sample-xml-output | 547 +++++++++++++++++++ .../resourcemanager/webapp/dao/AppInfo.java | 2 +- 46 files changed, 3385 insertions(+), 245 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java index 37b50f2..9a5bc79 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java @@ -206,8 +206,8 @@ public abstract class Resource implements Comparable { * * @return Map of resource name to ResourceInformation */ - @Public - @Evolving + @InterfaceAudience.Private + @InterfaceStability.Unstable public ResourceInformation[] getResources() { return resources; } @@ -220,7 +220,7 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException if the resource can't be found */ @Public - @Evolving + @InterfaceStability.Unstable public ResourceInformation getResourceInformation(String resource) throws ResourceNotFoundException { Integer index = ResourceUtils.getResourceTypeIndex().get(resource); @@ -240,8 +240,8 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException * if the resource can't be found */ - @Public - @Evolving + @InterfaceAudience.Private + @InterfaceStability.Unstable public ResourceInformation getResourceInformation(int index) throws ResourceNotFoundException { ResourceInformation ri = null; @@ -262,7 +262,7 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException if the resource can't be found */ @Public - @Evolving + @InterfaceStability.Unstable public long getResourceValue(String resource) throws ResourceNotFoundException { return getResourceInformation(resource).getValue(); @@ -276,7 +276,7 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException if the resource is not found */ @Public - @Evolving + @InterfaceStability.Unstable public void setResourceInformation(String resource, 
ResourceInformation resourceInformation) throws ResourceNotFoundException { @@ -302,8 +302,8 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException * if the resource is not found */ - @Public - @Evolving + @InterfaceAudience.Private + @InterfaceStability.Unstable public void setResourceInformation(int index, ResourceInformation resourceInformation) throws ResourceNotFoundException { @@ -323,7 +323,7 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException if the resource is not found */ @Public - @Evolving + @InterfaceStability.Unstable public void setResourceValue(String resource, long value) throws ResourceNotFoundException { if (resource.equals(ResourceInformation.MEMORY_URI)) { @@ -350,8 +350,8 @@ public abstract class Resource implements Comparable { * @throws ResourceNotFoundException * if the resource is not found */ - @Public - @Evolving + @InterfaceAudience.Private + @InterfaceStability.Unstable public void setResourceValue(int index, long value) throws ResourceNotFoundException { try { http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java index 785b311..84b4748 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java @@ -18,11 +18,14 @@ package org.apache.hadoop.yarn.api.records; -import org.apache.curator.shaded.com.google.common.reflect.ClassPath; +import com.google.common.collect.ImmutableMap; + import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes; import org.apache.hadoop.yarn.util.UnitsConversionUtil; +import java.util.Map; + /** * Class to encapsulate information about a Resource - the name of the resource, * the units(milli, micro, etc), the type(countable), and the value. @@ -36,13 +39,20 @@ public class ResourceInformation implements Comparable { private long minimumAllocation; private long maximumAllocation; + // Known resource types public static final String MEMORY_URI = "memory-mb"; public static final String VCORES_URI = "vcores"; + public static final String GPU_URI = "yarn.io/gpu"; public static final ResourceInformation MEMORY_MB = ResourceInformation.newInstance(MEMORY_URI, "Mi"); public static final ResourceInformation VCORES = ResourceInformation.newInstance(VCORES_URI); + public static final ResourceInformation GPUS = + ResourceInformation.newInstance(GPU_URI); + + public static final Map MANDATORY_RESOURCES = + ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS); /** * Get the name for the resource. 
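[Editorial note on the hunks above: the indexed Resource accessors become @Private/@Unstable while the string-keyed ones stay @Public, and ResourceInformation gains the yarn.io/gpu type. Below is a minimal, illustrative sketch of client code exercising that public surface; it is not part of the patch and assumes yarn.io/gpu has already been registered through resource-types.xml, otherwise the lookup ends in ResourceNotFoundException.]

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;

public class GpuResourceRequestExample {
  public static void main(String[] args) {
    // 4 GiB, 2 vcores; additional resource types ride along in the same record.
    Resource resource = Resource.newInstance(4096, 2);
    try {
      // String-keyed accessors stay @Public in this patch; the indexed ones become @Private.
      resource.setResourceValue(ResourceInformation.GPU_URI, 2);
      long gpus = resource.getResourceValue(ResourceInformation.GPU_URI);
      System.out.println("Requested GPUs: " + gpus);
    } catch (ResourceNotFoundException e) {
      // yarn.io/gpu was not declared in resource-types.xml on this client.
      System.err.println("GPU resource type not configured: " + e.getMessage());
    }
  }
}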
http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java index e9be6c3..43a339c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java @@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.api.records; import java.io.Serializable; import org.apache.hadoop.classification.InterfaceAudience.Public; +import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability.Evolving; import org.apache.hadoop.classification.InterfaceStability.Stable; import org.apache.hadoop.classification.InterfaceStability.Unstable; http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index ec1869f..63e46d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1411,6 +1411,39 @@ public class YarnConfiguration extends Configuration { public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT = NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit"; + /** + * Prefix for computation resources, example of computation resources like + * GPU / FPGA / TPU, etc. + */ + @Private + public static final String NM_RESOURCE_PLUGINS = + NM_PREFIX + "resource-plugins"; + + /** + * Prefix for gpu configurations. Work in progress: This configuration + * parameter may be changed/removed in the future. 
+ */ + @Private + public static final String NM_GPU_RESOURCE_PREFIX = + NM_RESOURCE_PLUGINS + ".gpu."; + + @Private + public static final String NM_GPU_ALLOWED_DEVICES = + NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices"; + @Private + public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto"; + + /** + * This setting controls where to how to invoke GPU binaries + */ + @Private + public static final String NM_GPU_PATH_TO_EXEC = + NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; + + @Private + public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; + + /** NM Webapp address.**/ public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address"; public static final int DEFAULT_NM_WEBAPP_PORT = 8042; http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java index 110453a..f3edc74 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java @@ -46,11 +46,11 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + /** * Helper class to read the resource-types to be supported by the system. */ -@InterfaceAudience.Public -@InterfaceStability.Unstable public class ResourceUtils { public static final String UNITS = ".units"; @@ -65,7 +65,6 @@ public class ResourceUtils { private static final Map RESOURCE_NAME_TO_INDEX = new ConcurrentHashMap(); private static volatile Map resourceTypes; - private static volatile String[] resourceNamesArray; private static volatile ResourceInformation[] resourceTypesArray; private static volatile boolean initializedNodeResources = false; private static volatile Map readOnlyNodeResources; @@ -85,33 +84,32 @@ public class ResourceUtils { */ String key = "memory"; if (resourceInformationMap.containsKey(key)) { - LOG.warn("Attempt to define resource '" + key + - "', but it is not allowed."); - throw new YarnRuntimeException("Attempt to re-define mandatory resource '" - + key + "'."); - } - - if (resourceInformationMap.containsKey(MEMORY)) { - ResourceInformation memInfo = resourceInformationMap.get(MEMORY); - String memUnits = ResourceInformation.MEMORY_MB.getUnits(); - ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType(); - if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType() - .equals(memType)) { - throw new YarnRuntimeException( - "Attempt to re-define mandatory resource 'memory-mb'. 
It can only" - + " be of type 'COUNTABLE' and have units 'Mi'."); - } + LOG.warn( + "Attempt to define resource '" + key + "', but it is not allowed."); + throw new YarnRuntimeException( + "Attempt to re-define mandatory resource '" + key + "'."); } - if (resourceInformationMap.containsKey(VCORES)) { - ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES); - String vcoreUnits = ResourceInformation.VCORES.getUnits(); - ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType(); - if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo - .getResourceType().equals(vcoreType)) { - throw new YarnRuntimeException( - "Attempt to re-define mandatory resource 'vcores'. It can only be" - + " of type 'COUNTABLE' and have units ''(no units)."); + for (Map.Entry mandatoryResourceEntry : + ResourceInformation.MANDATORY_RESOURCES.entrySet()) { + key = mandatoryResourceEntry.getKey(); + ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue(); + + ResourceInformation newDefinedRI = resourceInformationMap.get(key); + if (newDefinedRI != null) { + String expectedUnit = mandatoryRI.getUnits(); + ResourceTypes expectedType = mandatoryRI.getResourceType(); + String actualUnit = newDefinedRI.getUnits(); + ResourceTypes actualType = newDefinedRI.getResourceType(); + + if (!expectedUnit.equals(actualUnit) || !expectedType.equals( + actualType)) { + throw new YarnRuntimeException("Defined mandatory resource type=" + + key + " inside resource-types.xml, however its type or " + + "unit is conflict to mandatory resource types, expected type=" + + expectedType + ", unit=" + expectedUnit + "; actual type=" + + actualType + " actual unit=" + actualUnit); + } } } } @@ -270,7 +268,6 @@ public class ResourceUtils { private static void updateKnownResources() { // Update resource names. - resourceNamesArray = new String[resourceTypes.size()]; resourceTypesArray = new ResourceInformation[resourceTypes.size()]; int index = 2; @@ -278,14 +275,11 @@ public class ResourceUtils { if (resInfo.getName().equals(MEMORY)) { resourceTypesArray[0] = ResourceInformation .newInstance(resourceTypes.get(MEMORY)); - resourceNamesArray[0] = MEMORY; } else if (resInfo.getName().equals(VCORES)) { resourceTypesArray[1] = ResourceInformation .newInstance(resourceTypes.get(VCORES)); - resourceNamesArray[1] = VCORES; } else { resourceTypesArray[index] = ResourceInformation.newInstance(resInfo); - resourceNamesArray[index] = resInfo.getName(); index++; } } @@ -319,18 +313,6 @@ public class ResourceUtils { YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE); } - /** - * Get resource names array, this is mostly for performance perspective. Never - * modify returned array. 
- * - * @return resourceNamesArray - */ - public static String[] getResourceNamesArray() { - initializeResourceTypesIfNeeded(null, - YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE); - return resourceNamesArray; - } - public static ResourceInformation[] getResourceTypesArray() { initializeResourceTypesIfNeeded(null, YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE); http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java index 1e925d7..d7c799d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java @@ -19,8 +19,4 @@ * Package org.apache.hadoop.yarn.util.resource contains classes * which is used as utility class for resource profile computations. */ -@InterfaceAudience.Public -@InterfaceStability.Unstable -package org.apache.hadoop.yarn.util.resource; -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; \ No newline at end of file +package org.apache.hadoop.yarn.util.resource; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 768deb2..20442d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3381,6 +3381,16 @@ 0.0.0.0:8091 + + + yarn.resource-types + + + The resource types to be used for scheduling. Use resource-types.xml + to specify details about the individual resource types. + + + yarn.resourcemanager.display.per-user-apps false @@ -3481,11 +3491,43 @@ - yarn.resource-types + + When yarn.nodemanager.resource.gpu.allowed-gpu-devices=auto specified, + YARN NodeManager needs to run GPU discovery binary (now only support + nvidia-smi) to get GPU-related information. + When value is empty (default), YARN NodeManager will try to locate + discovery executable itself. + An example of the config value is: /usr/local/bin/nvidia-smi + + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables + + + - The resource types to be used for scheduling. Use resource-types.xml - to specify details about the individual resource types. + Enable additional discovery/isolation of resources on the NodeManager, + split by comma. By default, this is empty. Acceptable values: { "yarn-io/gpu" }. + + yarn.nodemanager.resource-plugins + + + + + + Specify GPU devices which can be managed by YARN NodeManager, split by comma + Number of GPU devices will be reported to RM to make scheduling decisions. + Set to auto (default) let YARN automatically discover GPU resource from + system. 
+ Manually specify GPU devices if auto detect GPU device failed or admin + only want subset of GPU devices managed by YARN. GPU device is identified + by their minor device number. A common approach to get minor device number + of GPUs is using "nvidia-smi -q" and search "Minor Number" output. An + example of manual specification is "0,1,2,4" to allow YARN NodeManager + to manage GPU devices with minor number 0/1/2/4. + yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices + auto + + http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java index d6bab92..80555ca 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java @@ -52,6 +52,23 @@ public class TestResourceUtils { } } + public static void addNewTypesToResources(String... resourceTypes) { + // Initialize resource map + Map riMap = new HashMap<>(); + + // Initialize mandatory resources + riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB); + riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES); + + for (String newResource : resourceTypes) { + riMap.put(newResource, ResourceInformation + .newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0, + Integer.MAX_VALUE)); + } + + ResourceUtils.initializeResourcesFromResourceInformationMap(riMap); + } + @Before public void setup() { ResourceUtils.resetResourceTypes(); http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 9454da4..e65b677 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -112,9 +112,10 @@ public abstract class ContainerExecutor implements Configurable { * Run the executor initialization steps. * Verify that the necessary configs and permissions are in place. * + * @param nmContext Context of NM * @throws IOException if initialization fails */ - public abstract void init() throws IOException; + public abstract void init(Context nmContext) throws IOException; /** * This function localizes the JAR file on-demand. 
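[Editorial note: the yarn-default.xml entries above are ordinary NodeManager settings. The sketch below shows an equivalent programmatic setup, assuming the plugin token matches ResourceInformation.GPU_URI ("yarn.io/gpu"); in a real cluster these values would normally live in yarn-site.xml, and the path shown is only an example.]

import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class GpuNodeManagerConfigExample {
  public static void main(String[] args) {
    YarnConfiguration conf = new YarnConfiguration();
    // Enable the GPU resource plugin on this NodeManager.
    conf.setStrings(YarnConfiguration.NM_RESOURCE_PLUGINS, ResourceInformation.GPU_URI);
    // "auto" (the default) lets the NM discover devices via nvidia-smi;
    // alternatively list minor device numbers explicitly, e.g. "0,1,2,4".
    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
    // Optional: absolute path to the discovery binary if it is not on the default search path.
    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/usr/local/bin/nvidia-smi");
    System.out.println("GPU plugin enabled: "
        + conf.get(YarnConfiguration.NM_RESOURCE_PLUGINS));
  }
}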
http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java index 33cefea..7e16034 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -122,4 +123,6 @@ public interface Context { void setNMTimelinePublisher(NMTimelinePublisher nmMetricsPublisher); NMTimelinePublisher getNMTimelinePublisher(); + + ResourcePluginManager getResourcePluginManager(); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java index b54b7f5..e659c3e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java @@ -134,7 +134,7 @@ public class DefaultContainerExecutor extends ContainerExecutor { } @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { // nothing to do or verify here } http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 765c49a..04f27c2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -281,7 +282,7 @@ public class LinuxContainerExecutor extends ContainerExecutor { } @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { Configuration conf = super.getConf(); // Send command to executor which will just start up, @@ -305,7 +306,7 @@ public class LinuxContainerExecutor extends ContainerExecutor { try { resourceHandlerChain = ResourceHandlerModule - .getConfiguredResourceHandlerChain(conf); + .getConfiguredResourceHandlerChain(conf, nmContext); if (LOG.isDebugEnabled()) { LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain != null)); @@ -845,4 +846,9 @@ public class LinuxContainerExecutor extends ContainerExecutor { e); } } + + @VisibleForTesting + public ResourceHandler getResourceHandler() { + return resourceHandlerChain; + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 431a894..3f27abe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -18,23 +18,7 @@ package org.apache.hadoop.yarn.server.nodemanager; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ConcurrentSkipListMap; -import java.util.concurrent.atomic.AtomicBoolean; - -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; -import org.apache.hadoop.yarn.state.MultiStateTransitionListener; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import 
com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -65,12 +49,16 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.records.AppCollectorData; import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -78,14 +66,25 @@ import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ScriptBasedNodeLabel import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; -import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer; +import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; +import org.apache.hadoop.yarn.state.MultiStateTransitionListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.atomic.AtomicBoolean; public class NodeManager extends CompositeService implements EventHandler { @@ -331,6 +330,18 @@ public class NodeManager extends CompositeService nmCheckintervalTime, scriptTimeout, scriptArgs); } + @VisibleForTesting + protected ResourcePluginManager createResourcePluginManager() { + return new ResourcePluginManager(); + } + + @VisibleForTesting + 
protected ContainerExecutor createContainerExecutor(Configuration conf) { + return ReflectionUtils.newInstance( + conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, + DefaultContainerExecutor.class, ContainerExecutor.class), conf); + } + @Override protected void serviceInit(Configuration conf) throws Exception { @@ -359,11 +370,20 @@ public class NodeManager extends CompositeService this.aclsManager = new ApplicationACLsManager(conf); - ContainerExecutor exec = ReflectionUtils.newInstance( - conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, - DefaultContainerExecutor.class, ContainerExecutor.class), conf); + boolean isDistSchedulingEnabled = + conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, + YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); + + this.context = createNMContext(containerTokenSecretManager, + nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); + + ResourcePluginManager pluginManager = createResourcePluginManager(); + pluginManager.initialize(context); + ((NMContext)context).setResourcePluginManager(pluginManager); + + ContainerExecutor exec = createContainerExecutor(conf); try { - exec.init(); + exec.init(context); } catch (IOException e) { throw new YarnRuntimeException("Failed to initialize container executor", e); } @@ -379,13 +399,6 @@ public class NodeManager extends CompositeService getNodeHealthScriptRunner(conf), dirsHandler); addService(nodeHealthChecker); - boolean isDistSchedulingEnabled = - conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, - YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); - - this.context = createNMContext(containerTokenSecretManager, - nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); - ((NMContext)context).setContainerExecutor(exec); @@ -459,6 +472,12 @@ public class NodeManager extends CompositeService try { super.serviceStop(); DefaultMetricsSystem.shutdown(); + + // Cleanup ResourcePluginManager + ResourcePluginManager rpm = context.getResourcePluginManager(); + if (rpm != null) { + rpm.cleanup(); + } } finally { // YARN-3641: NM's services stop get failed shouldn't block the // release of NMLevelDBStore. 
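[Editorial note: serviceInit now builds the NMContext first, initializes a ResourcePluginManager against it, and only then creates and initializes the ContainerExecutor with that context; serviceStop releases the plugin manager. The two new @VisibleForTesting factories make that ordering easy to exercise with mocks, roughly as in the sketch below (Mockito-based; the class name is illustrative and not part of the patch).]

import static org.mockito.Mockito.mock;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;

// Subclass that swaps in a mocked plugin manager, mirroring how the new factories can be overridden.
public class StubbedNodeManager extends NodeManager {
  private final ResourcePluginManager rpm = mock(ResourcePluginManager.class);

  @Override
  protected ResourcePluginManager createResourcePluginManager() {
    return rpm;
  }

  @Override
  protected ContainerExecutor createContainerExecutor(Configuration conf) {
    // DefaultContainerExecutor.init(Context) is a no-op, so it is safe for tests.
    return new DefaultContainerExecutor();
  }
}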
@@ -596,6 +615,8 @@ public class NodeManager extends CompositeService private NMTimelinePublisher nmTimelinePublisher; + private ResourcePluginManager resourcePluginManager; + public NMContext(NMContainerTokenSecretManager containerTokenSecretManager, NMTokenSecretManagerInNM nmTokenSecretManager, LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager, @@ -796,6 +817,15 @@ public class NodeManager extends CompositeService public NMTimelinePublisher getNMTimelinePublisher() { return nmTimelinePublisher; } + + public ResourcePluginManager getResourcePluginManager() { + return resourcePluginManager; + } + + public void setResourcePluginManager( + ResourcePluginManager resourcePluginManager) { + this.resourcePluginManager = resourcePluginManager; + } } /** http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 888ee85..d776bdf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -33,6 +33,9 @@ import java.util.Map.Entry; import java.util.Random; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -178,14 +181,15 @@ public class NodeStatusUpdaterImpl extends AbstractService implements long memoryMb = totalResource.getMemorySize(); float vMemToPMem = conf.getFloat( - YarnConfiguration.NM_VMEM_PMEM_RATIO, - YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); + YarnConfiguration.NM_VMEM_PMEM_RATIO, + YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem); - int virtualCores = totalResource.getVirtualCores(); - LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB."); - LOG.info("Nodemanager resources: vcores set to " + virtualCores + "."); - LOG.info("Nodemanager resources: " + totalResource); + + // Update configured resources via plugins. 
+ updateConfiguredResourcesViaPlugins(totalResource); + + LOG.info("Nodemanager resources is set to: " + totalResource); metrics.addResource(totalResource); @@ -342,12 +346,27 @@ public class NodeStatusUpdaterImpl extends AbstractService implements return ServerRMProxy.createRMProxy(conf, ResourceTracker.class); } + private void updateConfiguredResourcesViaPlugins( + Resource configuredResource) throws YarnException { + ResourcePluginManager pluginManager = context.getResourcePluginManager(); + if (pluginManager != null && pluginManager.getNameToPlugins() != null) { + // Update configured resource + for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins() + .values()) { + if (resourcePlugin.getNodeResourceHandlerInstance() != null) { + resourcePlugin.getNodeResourceHandlerInstance() + .updateConfiguredResource(configuredResource); + } + } + } + } + @VisibleForTesting protected void registerWithRM() throws YarnException, IOException { RegisterNodeManagerResponse regNMResponse; Set nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration(); - + // Synchronize NM-RM registration with // ContainerManagerImpl#increaseContainersResource and // ContainerManagerImpl#startContainers to avoid race condition @@ -358,6 +377,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource, nodeManagerVersionId, containerReports, getRunningApplications(), nodeLabels, physicalResource); + if (containerReports != null) { LOG.info("Registering with RM using containers :" + containerReports); } @@ -406,7 +426,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements if (masterKey != null) { this.context.getContainerTokenSecretManager().setMasterKey(masterKey); } - + masterKey = regNMResponse.getNMTokenMasterKey(); if (masterKey != null) { this.context.getNMTokenSecretManager().setMasterKey(masterKey); @@ -733,7 +753,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements } } } - + @Override public long getRMIdentifier() { return this.rmIdentifier; http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java index 8402a16..db0b225 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java @@ -51,6 +51,7 @@ public class PrivilegedOperation { TC_READ_STATS("--tc-read-stats"), ADD_PID_TO_CGROUP(""), //no CLI switch supported yet. RUN_DOCKER_CMD("--run-docker"), + GPU("--module-gpu"), LIST_AS_USER(""); //no CLI switch supported yet. 
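[Editorial note: updateConfiguredResourcesViaPlugins, shown above, gives each configured plugin's NodeResourceUpdaterPlugin a chance to rewrite the node resource before it is registered with the RM. Below is a rough, illustrative handler; it assumes the NodeResourceUpdaterPlugin added elsewhere in this patch is an abstract class exposing the updateConfiguredResource(Resource) hook that the loop calls, and the hard-coded GPU count is purely for the sketch (the real GpuNodeResourceUpdateHandler asks GpuDiscoverer).]

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;

// Illustrative handler that advertises a fixed number of GPUs on this node.
public class FixedGpuResourceUpdateHandler extends NodeResourceUpdaterPlugin {
  private static final int DISCOVERED_GPUS = 4; // assumption for the sketch

  @Override
  public void updateConfiguredResource(Resource res) {
    // Sets the yarn.io/gpu value the RM will see at registration time.
    res.setResourceValue(ResourceInformation.GPU_URI, DISCOVERED_GPUS);
  }
}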
private final String option; http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java index 955d216..72bf30c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -135,7 +136,8 @@ public class ResourceHandlerChain implements ResourceHandler { return allOperations; } - List getResourceHandlerList() { + @VisibleForTesting + public List getResourceHandlerList() { return Collections.unmodifiableList(resourceHandlers); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java index 3c61cd4..ce850ab 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java @@ -21,25 +21,28 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler; import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; -import java.util.Set; -import java.util.HashSet; -import java.util.Map; -import java.util.HashMap; -import java.util.Arrays; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; /** * Provides mechanisms to get various resource handlers - cpu, memory, network, @@ -206,22 +209,41 @@ public class ResourceHandlerModule { } private static void initializeConfiguredResourceHandlerChain( - Configuration conf) throws ResourceHandlerException { + Configuration conf, Context nmContext) + throws ResourceHandlerException { ArrayList handlerList = new ArrayList<>(); addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf)); addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf)); addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf)); addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf)); + addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext); resourceHandlerChain = new ResourceHandlerChain(handlerList); } + private static void addHandlersFromConfiguredResourcePlugins( + List handlerList, Configuration conf, + Context nmContext) throws ResourceHandlerException { + ResourcePluginManager pluginManager = nmContext.getResourcePluginManager(); + if (pluginManager != null) { + Map pluginMap = pluginManager.getNameToPlugins(); + if (pluginMap != null) { + for (ResourcePlugin plugin : pluginMap.values()) { + addHandlerIfNotNull(handlerList, plugin + .createResourceHandler(nmContext, + getInitializedCGroupsHandler(conf), + PrivilegedOperationExecutor.getInstance(conf))); + } + } + } + } + public static ResourceHandlerChain getConfiguredResourceHandlerChain( - Configuration conf) throws ResourceHandlerException { + Configuration conf, Context nmContext) throws ResourceHandlerException { if (resourceHandlerChain == null) { synchronized (ResourceHandlerModule.class) { if (resourceHandlerChain == null) { - initializeConfiguredResourceHandlerChain(conf); + initializeConfiguredResourceHandlerChain(conf, nmContext); } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java new file mode 100644 index 0000000..d6bae09 --- /dev/null +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java @@ -0,0 +1,242 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + +/** + * Allocate GPU resources according to requirements + */ +public class GpuResourceAllocator { + final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); + + private Set allowedGpuDevices = new TreeSet<>(); + private Map usedDevices = new TreeMap<>(); + private Context nmContext; + + public GpuResourceAllocator(Context ctx) { + this.nmContext = ctx; + } + + /** + * Contains allowed and denied devices with minor number. + * Denied devices will be useful for cgroups devices module to do blacklisting + */ + static class GpuAllocation { + private Set allowed = Collections.emptySet(); + private Set denied = Collections.emptySet(); + + GpuAllocation(Set allowed, Set denied) { + if (allowed != null) { + this.allowed = ImmutableSet.copyOf(allowed); + } + if (denied != null) { + this.denied = ImmutableSet.copyOf(denied); + } + } + + public Set getAllowedGPUs() { + return allowed; + } + + public Set getDeniedGPUs() { + return denied; + } + } + + /** + * Add GPU to allowed list + * @param minorNumber minor number of the GPU device. 
+ */ + public synchronized void addGpu(int minorNumber) { + allowedGpuDevices.add(minorNumber); + } + + private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices, + ContainerId containerId) { + return "Failed to find enough GPUs, requestor=" + containerId + + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus=" + + getAvailableGpus(); + } + + @VisibleForTesting + public synchronized int getAvailableGpus() { + return allowedGpuDevices.size() - usedDevices.size(); + } + + public synchronized void recoverAssignedGpus(ContainerId containerId) + throws ResourceHandlerException { + Container c = nmContext.getContainers().get(containerId); + if (null == c) { + throw new ResourceHandlerException( + "This shouldn't happen, cannot find container with id=" + + containerId); + } + + for (Serializable deviceId : c.getResourceMappings().getAssignedResources( + GPU_URI)){ + if (!(deviceId instanceof String)) { + throw new ResourceHandlerException( + "Trying to recover device id, however it" + + " is not String, this shouldn't happen"); + } + + + int devId; + try { + devId = Integer.parseInt((String)deviceId); + } catch (NumberFormatException e) { + throw new ResourceHandlerException("Failed to recover device id because" + + "it is not a valid integer, devId:" + deviceId); + } + + // Make sure it is in allowed GPU device. + if (!allowedGpuDevices.contains(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is not in allowed device list:" + StringUtils + .join(",", allowedGpuDevices)); + } + + // Make sure it is not occupied by anybody else + if (usedDevices.containsKey(devId)) { + throw new ResourceHandlerException("Try to recover device id = " + devId + + " however it is already assigned to container=" + usedDevices + .get(devId) + ", please double check what happened."); + } + + usedDevices.put(devId, containerId); + } + } + + private int getRequestedGpus(Resource requestedResource) { + try { + return Long.valueOf(requestedResource.getResourceValue( + GPU_URI)).intValue(); + } catch (ResourceNotFoundException e) { + return 0; + } + } + + /** + * Assign GPU to requestor + * @param container container to allocate + * @return List of denied Gpus with minor numbers + * @throws ResourceHandlerException When failed to + */ + public synchronized GpuAllocation assignGpus(Container container) + throws ResourceHandlerException { + Resource requestedResource = container.getResource(); + ContainerId containerId = container.getContainerId(); + int numRequestedGpuDevices = getRequestedGpus(requestedResource); + // Assign Gpus to container if requested some. + if (numRequestedGpuDevices > 0) { + if (numRequestedGpuDevices > getAvailableGpus()) { + throw new ResourceHandlerException( + getResourceHandlerExceptionMessage(numRequestedGpuDevices, + containerId)); + } + + Set assignedGpus = new HashSet<>(); + + for (int deviceNum : allowedGpuDevices) { + if (!usedDevices.containsKey(deviceNum)) { + usedDevices.put(deviceNum, containerId); + assignedGpus.add(deviceNum); + if (assignedGpus.size() == numRequestedGpuDevices) { + break; + } + } + } + + // Record in state store if we allocated anything + if (!assignedGpus.isEmpty()) { + List allocatedDevices = new ArrayList<>(); + for (int gpu : assignedGpus) { + allocatedDevices.add(String.valueOf(gpu)); + } + try { + // Update Container#getResourceMapping. 
+ ResourceMappings.AssignedResources assignedResources = + new ResourceMappings.AssignedResources(); + assignedResources.updateAssignedResources(allocatedDevices); + container.getResourceMappings().addAssignedResources(GPU_URI, + assignedResources); + + // Update state store. + nmContext.getNMStateStore().storeAssignedResources(containerId, + GPU_URI, allocatedDevices); + } catch (IOException e) { + cleanupAssignGpus(containerId); + throw new ResourceHandlerException(e); + } + } + + return new GpuAllocation(assignedGpus, + Sets.difference(allowedGpuDevices, assignedGpus)); + } + return new GpuAllocation(null, allowedGpuDevices); + } + + /** + * Clean up all Gpus assigned to containerId + * @param containerId containerId + */ + public synchronized void cleanupAssignGpus(ContainerId containerId) { + Iterator> iter = + usedDevices.entrySet().iterator(); + while (iter.hasNext()) { + if (iter.next().getValue().equals(containerId)) { + iter.remove(); + } + } + } + + @VisibleForTesting + public synchronized Map getDeviceAllocationMapping() { + return new HashMap<>(usedDevices); + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java new file mode 100644 index 0000000..7144bb2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -0,0 +1,153 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class GpuResourceHandlerImpl implements ResourceHandler { + final static Log LOG = LogFactory + .getLog(GpuResourceHandlerImpl.class); + + // This will be used by container-executor to add necessary clis + public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus"; + public static final String CONTAINER_ID_CLI_OPTION = "--container_id"; + + private GpuResourceAllocator gpuAllocator; + private CGroupsHandler cGroupsHandler; + private PrivilegedOperationExecutor privilegedOperationExecutor; + + public GpuResourceHandlerImpl(Context nmContext, + CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor) { + this.cGroupsHandler = cGroupsHandler; + this.privilegedOperationExecutor = privilegedOperationExecutor; + gpuAllocator = new GpuResourceAllocator(nmContext); + } + + @Override + public List bootstrap(Configuration configuration) + throws ResourceHandlerException { + List minorNumbersOfUsableGpus; + try { + minorNumbersOfUsableGpus = GpuDiscoverer.getInstance() + .getMinorNumbersOfGpusUsableByYarn(); + } catch (YarnException e) { + LOG.error("Exception when trying to get usable GPU device", e); + throw new ResourceHandlerException(e); + } + + for (int minorNumber : minorNumbersOfUsableGpus) { + gpuAllocator.addGpu(minorNumber); + } + + // And initialize cgroups + this.cGroupsHandler.initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + + return null; + } + + @Override + public synchronized List preStart(Container container) + throws ResourceHandlerException { + String containerIdStr = container.getContainerId().toString(); + + // Assign Gpus to container if requested some. 
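+ // assignGpus() throws ResourceHandlerException if fewer GPUs are free
+ // than requested; otherwise it returns both the allowed and the denied
+ // device sets, and the denied set is passed to container-executor below
+ // via --excluded_gpus.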
+ GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus( + container); + + // Create device cgroups for the container + cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + try { + // Execute c-e to setup GPU isolation before launch the container + PrivilegedOperation privilegedOperation = new PrivilegedOperation( + PrivilegedOperation.OperationType.GPU, Arrays + .asList(CONTAINER_ID_CLI_OPTION, containerIdStr)); + if (!allocation.getDeniedGPUs().isEmpty()) { + privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", allocation.getDeniedGPUs()))); + } + + privilegedOperationExecutor.executePrivilegedOperation( + privilegedOperation, true); + } catch (PrivilegedOperationException e) { + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + LOG.warn("Could not update cgroup for container", e); + throw new ResourceHandlerException(e); + } + + List ret = new ArrayList<>(); + ret.add(new PrivilegedOperation( + PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + PrivilegedOperation.CGROUP_ARG_PREFIX + + cGroupsHandler.getPathForCGroupTasks( + CGroupsHandler.CGroupController.DEVICES, containerIdStr))); + + return ret; + } + + @VisibleForTesting + public GpuResourceAllocator getGpuAllocator() { + return gpuAllocator; + } + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + gpuAllocator.recoverAssignedGpus(containerId); + return null; + } + + @Override + public synchronized List postComplete( + ContainerId containerId) throws ResourceHandlerException { + gpuAllocator.cleanupAssignGpus(containerId); + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerId.toString()); + return null; + } + + @Override + public List teardown() throws ResourceHandlerException { + return null; + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java new file mode 100644 index 0000000..88f77ed --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin; + +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.exceptions.YarnException; + +/** + * Plugins to handle resources on a node. This will be used by + * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater} + */ +public abstract class NodeResourceUpdaterPlugin { + /** + * Update configured resource for the given component. + * @param res resource passed in by external mododule (such as + * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater} + * @throws YarnException when any issue happens. + */ + public abstract void updateConfiguredResource(Resource res) + throws YarnException; + + /** + * This method will be called when the node's resource is loaded from + * dynamic-resources.xml in ResourceManager. + * + * @param newResource newResource reported by RM + * @throws YarnException when any mismatch between NM/RM + */ + public void handleUpdatedResourceFromRM(Resource newResource) throws + YarnException { + // by default do nothing, subclass should implement this method when any + // special activities required upon new resource reported by RM. + } + + // TODO: add implementation to update node attribute once YARN-3409 merged. +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java new file mode 100644 index 0000000..6e134b3 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin; + +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; + +/** + * {@link ResourcePlugin} is an interface that makes it easier for the node + * manager to support discovery, management and isolation of new resource + * types. + * + * <p>
+ * It has two major parts: {@link ResourcePlugin#createResourceHandler(Context, + * CGroupsHandler, PrivilegedOperationExecutor)} and + * {@link ResourcePlugin#getNodeResourceHandlerInstance()}; see the javadocs + * below for more details. + * </p>
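+ *
+ * <p>
+ * A minimal sketch of a possible implementation is shown below; the class
+ * name {@code FooResourcePlugin} is hypothetical. It models a resource that
+ * needs neither special isolation nor node resource updates, so both factory
+ * methods simply return null.
+ * </p>
+ *
+ * <pre>
+ * public class FooResourcePlugin implements ResourcePlugin {
+ *   public void initialize(Context context) throws YarnException {
+ *     // discover devices / validate configuration here
+ *   }
+ *
+ *   public ResourceHandler createResourceHandler(Context nmContext,
+ *       CGroupsHandler cGroupsHandler,
+ *       PrivilegedOperationExecutor privilegedOperationExecutor) {
+ *     return null; // no special isolation required for this resource
+ *   }
+ *
+ *   public NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
+ *     return null; // no node resource discovery/update required
+ *   }
+ *
+ *   public void cleanup() throws YarnException {
+ *     // release any state held by the plugin
+ *   }
+ * }
+ * </pre>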
+ */ +public interface ResourcePlugin { + /** + * Initialize the plugin, this will be invoked during NM startup. + * @param context NM Context + * @throws YarnException when any issue occurs + */ + void initialize(Context context) throws YarnException; + + /** + * Plugin needs to return {@link ResourceHandler} when any special isolation + * required for the resource type. This will be added to + * {@link ResourceHandlerChain} during NodeManager startup. When no special + * isolation need, return null. + * + * @param nmContext NodeManager context. + * @param cGroupsHandler CGroupsHandler + * @param privilegedOperationExecutor Privileged Operation Executor. + * @return ResourceHandler + */ + ResourceHandler createResourceHandler(Context nmContext, + CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor); + + /** + * Plugin needs to return {@link NodeResourceUpdaterPlugin} when any discovery + * mechanism required for the resource type. For example, if we want to set + * resource-value during NM registration or send update during NM-RM heartbeat + * We can implement a {@link NodeResourceUpdaterPlugin} and update fields of + * {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest} + * or {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest} + * + * This will be invoked during every node status update or node registration, + * please avoid creating new instance every time. + * + * @return NodeResourceUpdaterPlugin, could be null when no discovery needed. + */ + NodeResourceUpdaterPlugin getNodeResourceHandlerInstance(); + + /** + * Do cleanup of the plugin, this will be invoked when + * {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops + * @throws YarnException if any issue occurs + */ + void cleanup() throws YarnException; +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/15319c79/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java new file mode 100644 index 0000000..73d6038 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin; + +import com.google.common.collect.ImmutableSet; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + +/** + * Manages {@link ResourcePlugin} configured on this NodeManager. + */ +public class ResourcePluginManager { + private static final Logger LOG = + LoggerFactory.getLogger(ResourcePluginManager.class); + private static final Set SUPPORTED_RESOURCE_PLUGINS = ImmutableSet.of( + GPU_URI); + + private Map configuredPlugins = Collections.EMPTY_MAP; + + public synchronized void initialize(Context context) + throws YarnException { + Configuration conf = context.getConf(); + String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS); + + if (plugins != null) { + Map pluginMap = new HashMap<>(); + + // Initialize each plugins + for (String resourceName : plugins) { + resourceName = resourceName.trim(); + if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) { + String msg = + "Trying to initialize resource plugin with name=" + resourceName + + ", it is not supported, list of supported plugins:" + + StringUtils.join(",", + SUPPORTED_RESOURCE_PLUGINS); + LOG.error(msg); + throw new YarnException(msg); + } + + if (pluginMap.containsKey(resourceName)) { + // Duplicated items, ignore ... + continue; + } + + ResourcePlugin plugin = null; + if (resourceName.equals(GPU_URI)) { + plugin = new GpuResourcePlugin(); + } + + if (plugin == null) { + throw new YarnException( + "This shouldn't happen, plugin=" + resourceName + + " should be loaded and initialized"); + } + plugin.initialize(context); + pluginMap.put(resourceName, plugin); + } + + configuredPlugins = Collections.unmodifiableMap(pluginMap); + } + } + + public synchronized void cleanup() throws YarnException { + for (ResourcePlugin plugin : configuredPlugins.values()) { + plugin.cleanup(); + } + } + + /** + * Get resource name (such as gpu/fpga) to plugin references. + * @return read-only map of resource name to plugins. + */ + public synchronized Map getNameToPlugins() { + return configuredPlugins; + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org For additional commands, e-mail: common-commits-help@hadoop.apache.org