Return-Path: X-Original-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Delivered-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8D2DA100D2 for ; Thu, 17 Oct 2013 05:34:57 +0000 (UTC) Received: (qmail 97785 invoked by uid 500); 17 Oct 2013 05:34:38 -0000 Delivered-To: apmail-hadoop-yarn-commits-archive@hadoop.apache.org Received: (qmail 97636 invoked by uid 500); 17 Oct 2013 05:34:33 -0000 Mailing-List: contact yarn-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: yarn-commits@hadoop.apache.org Delivered-To: mailing list yarn-commits@hadoop.apache.org Received: (qmail 97524 invoked by uid 99); 17 Oct 2013 05:34:28 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 17 Oct 2013 05:34:28 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 17 Oct 2013 05:34:10 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 1DDD22388C63; Thu, 17 Oct 2013 05:33:14 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1532967 [3/9] - in /hadoop/common/branches/HDFS-4949/hadoop-yarn-project: ./ hadoop-yarn/bin/ hadoop-yarn/conf/ hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/protocolrecords/ hadoop-yarn/hadoop-yarn-api/src/main/java... Date: Thu, 17 Oct 2013 05:33:06 -0000 To: yarn-commits@hadoop.apache.org From: wang@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131017053314.1DDD22388C63@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto Thu Oct 17 05:32:42 2013 @@ -20,6 +20,7 @@ option java_package = "org.apache.hadoop option java_outer_classname = "YarnServerCommonServiceProtos"; option java_generic_services = true; option java_generate_equals_and_hash = true; +package hadoop.yarn; import "yarn_protos.proto"; import "yarn_server_common_protos.proto"; @@ -28,6 +29,7 @@ message RegisterNodeManagerRequestProto optional NodeIdProto node_id = 1; optional int32 http_port = 3; optional ResourceProto resource = 4; + optional string nm_version = 5; } message RegisterNodeManagerResponseProto { @@ -36,6 +38,7 @@ message RegisterNodeManagerResponseProto optional NodeActionProto nodeAction = 3; optional int64 rm_identifier = 4; optional string diagnostics_message = 5; + optional string rm_version = 6; } message NodeHeartbeatRequestProto { @@ -44,7 +47,6 @@ message NodeHeartbeatRequestProto { optional MasterKeyProto last_known_nm_token_master_key = 3; } - message NodeHeartbeatResponseProto { optional int32 response_id = 1; optional MasterKeyProto container_token_master_key = 2; Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java Thu Oct 17 05:32:42 2013 @@ -27,13 +27,31 @@ import org.apache.hadoop.yarn.server.nod public class CMgrCompletedAppsEvent extends ContainerManagerEvent { private final List appsToCleanup; + private final Reason reason; - public CMgrCompletedAppsEvent(List appsToCleanup) { + public CMgrCompletedAppsEvent(List appsToCleanup, Reason reason) { super(ContainerManagerEventType.FINISH_APPS); this.appsToCleanup = appsToCleanup; + this.reason = reason; } public List getAppsToCleanup() { return this.appsToCleanup; } + + public Reason getReason() { + return reason; + } + + public static enum Reason { + /** + * Application is killed as NodeManager is shut down + */ + ON_SHUTDOWN, + + /** + * Application is killed by ResourceManager + */ + BY_RESOURCEMANAGER + } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java Thu Oct 17 05:32:42 2013 @@ -24,10 +24,11 @@ import org.apache.hadoop.yarn.api.record public class CMgrCompletedContainersEvent extends ContainerManagerEvent { - private List containerToCleanup; - private Reason reason; - - public CMgrCompletedContainersEvent(List containersToCleanup, Reason reason) { + private final List containerToCleanup; + private final Reason reason; + + public CMgrCompletedContainersEvent(List containersToCleanup, + Reason reason) { super(ContainerManagerEventType.FINISH_CONTAINERS); this.containerToCleanup = containersToCleanup; this.reason = reason; @@ -36,12 +37,27 @@ public class CMgrCompletedContainersEven public List getContainersToCleanup() { return this.containerToCleanup; } - + public Reason getReason() { return reason; } - + public static enum Reason { - ON_SHUTDOWN, BY_RESOURCEMANAGER + /** + * Container is killed as NodeManager is shutting down + */ + ON_SHUTDOWN, + + /** + * Container is killed as the Nodemanager is re-syncing with the + * ResourceManager + */ + ON_NODEMANAGER_RESYNC, + + /** + * Container is killed on request by the ResourceManager + */ + BY_RESOURCEMANAGER } + } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java Thu Oct 17 05:32:42 2013 @@ -218,7 +218,7 @@ public abstract class ContainerExecutor retCommand.addAll(Arrays.asList("nice", "-n", Integer.toString(containerSchedPriorityAdjustment))); } - retCommand.addAll(Arrays.asList("bash", "-c", command)); + retCommand.addAll(Arrays.asList("bash", command)); return retCommand.toArray(new String[retCommand.size()]); } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java Thu Oct 17 05:32:42 2013 @@ -277,7 +277,7 @@ public class DefaultContainerExecutor ex pout.println("echo $$ > " + pidFile.toString() + ".tmp"); pout.println("/bin/mv -f " + pidFile.toString() + ".tmp " + pidFile); String exec = Shell.isSetsidAvailable? "exec setsid" : "exec"; - pout.println(exec + " /bin/bash -c \"" + + pout.println(exec + " /bin/bash \"" + launchDst.toUri().getPath().toString() + "\""); } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java Thu Oct 17 05:32:42 2013 @@ -24,11 +24,13 @@ import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ShellCommandExecutor; @@ -48,6 +50,8 @@ public class LinuxContainerExecutor exte private static final Log LOG = LogFactory .getLog(LinuxContainerExecutor.class); + private String nonsecureLocalUser; + private Pattern nonsecureLocalUserPattern; private String containerExecutorExe; private LCEResourcesHandler resourcesHandler; private boolean containerSchedPriorityIsSet = false; @@ -70,6 +74,24 @@ public class LinuxContainerExecutor exte .getInt(YarnConfiguration.NM_CONTAINER_EXECUTOR_SCHED_PRIORITY, YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_SCHED_PRIORITY); } + nonsecureLocalUser = conf.get( + YarnConfiguration.NM_NONSECURE_MODE_LOCAL_USER_KEY, + YarnConfiguration.DEFAULT_NM_NONSECURE_MODE_LOCAL_USER); + nonsecureLocalUserPattern = Pattern.compile( + conf.get(YarnConfiguration.NM_NONSECURE_MODE_USER_PATTERN_KEY, + YarnConfiguration.DEFAULT_NM_NONSECURE_MODE_USER_PATTERN)); + } + + void verifyUsernamePattern(String user) { + if (!UserGroupInformation.isSecurityEnabled() && + !nonsecureLocalUserPattern.matcher(user).matches()) { + throw new IllegalArgumentException("Invalid user name '" + user + "'," + + " it must match '" + nonsecureLocalUserPattern.pattern() + "'"); + } + } + + String getRunAsUser(String user) { + return UserGroupInformation.isSecurityEnabled() ? user : nonsecureLocalUser; } /** @@ -162,9 +184,12 @@ public class LinuxContainerExecutor exte List localDirs, List logDirs) throws IOException, InterruptedException { + verifyUsernamePattern(user); + String runAsUser = getRunAsUser(user); List command = new ArrayList(); addSchedPriorityCommand(command); command.addAll(Arrays.asList(containerExecutorExe, + runAsUser, user, Integer.toString(Commands.INITIALIZE_CONTAINER.getValue()), appId, @@ -218,6 +243,9 @@ public class LinuxContainerExecutor exte String user, String appId, Path containerWorkDir, List localDirs, List logDirs) throws IOException { + verifyUsernamePattern(user); + String runAsUser = getRunAsUser(user); + ContainerId containerId = container.getContainerId(); String containerIdStr = ConverterUtils.toString(containerId); @@ -234,7 +262,7 @@ public class LinuxContainerExecutor exte List command = new ArrayList(); addSchedPriorityCommand(command); command.addAll(Arrays.asList( - containerExecutorExe, user, Integer + containerExecutorExe, runAsUser, user, Integer .toString(Commands.LAUNCH_CONTAINER.getValue()), appId, containerIdStr, containerWorkDir.toString(), nmPrivateCotainerScriptPath.toUri().getPath().toString(), @@ -293,8 +321,12 @@ public class LinuxContainerExecutor exte public boolean signalContainer(String user, String pid, Signal signal) throws IOException { + verifyUsernamePattern(user); + String runAsUser = getRunAsUser(user); + String[] command = new String[] { containerExecutorExe, + runAsUser, user, Integer.toString(Commands.SIGNAL_CONTAINER.getValue()), pid, @@ -322,8 +354,12 @@ public class LinuxContainerExecutor exte @Override public void deleteAsUser(String user, Path dir, Path... baseDirs) { + verifyUsernamePattern(user); + String runAsUser = getRunAsUser(user); + List command = new ArrayList( Arrays.asList(containerExecutorExe, + runAsUser, user, Integer.toString(Commands.DELETE_AS_USER.getValue()), dir == null ? "" : dir.toUri().getPath())); Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java Thu Oct 17 05:32:42 2013 @@ -19,9 +19,6 @@ package org.apache.hadoop.yarn.server.nodemanager; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentSkipListMap; @@ -31,6 +28,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.http.HttpConfig; +import org.apache.hadoop.http.HttpConfig.Policy; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.service.CompositeService; @@ -67,11 +66,6 @@ public class NodeManager extends Composi * Priority of the NodeManager shutdown hook. */ public static final int SHUTDOWN_HOOK_PRIORITY = 30; - - /** - * Extra duration to wait for containers to be killed on shutdown. - */ - private static final int SHUTDOWN_CLEANUP_SLOP_MS = 1000; private static final Log LOG = LogFactory.getLog(NodeManager.class); protected final NodeManagerMetrics metrics = NodeManagerMetrics.create(); @@ -84,8 +78,6 @@ public class NodeManager extends Composi private NodeStatusUpdater nodeStatusUpdater; private static CompositeServiceShutdownHook nodeManagerShutdownHook; - private long waitForContainersOnShutdownMillis; - private AtomicBoolean isStopping = new AtomicBoolean(false); public NodeManager() { @@ -193,13 +185,6 @@ public class NodeManager extends Composi // so that we make sure everything is up before registering with RM. addService(nodeStatusUpdater); - waitForContainersOnShutdownMillis = - conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, - YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS) + - conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS, - YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS) + - SHUTDOWN_CLEANUP_SLOP_MS; - super.serviceInit(conf); // TODO add local dirs to del } @@ -219,9 +204,6 @@ public class NodeManager extends Composi if (isStopping.getAndSet(true)) { return; } - if (context != null) { - cleanupContainers(NodeManagerEventType.SHUTDOWN); - } super.serviceStop(); DefaultMetricsSystem.shutdown(); } @@ -246,68 +228,13 @@ public class NodeManager extends Composi public void run() { LOG.info("Notifying ContainerManager to block new container-requests"); containerManager.setBlockNewContainerRequests(true); - cleanupContainers(NodeManagerEventType.RESYNC); + LOG.info("Cleaning up running containers on resync"); + containerManager.cleanupContainersOnNMResync(); ((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater(); } }.start(); } - @SuppressWarnings("unchecked") - protected void cleanupContainers(NodeManagerEventType eventType) { - Map containers = context.getContainers(); - if (containers.isEmpty()) { - return; - } - LOG.info("Containers still running on " + eventType + " : " - + containers.keySet()); - - List containerIds = - new ArrayList(containers.keySet()); - dispatcher.getEventHandler().handle( - new CMgrCompletedContainersEvent(containerIds, - CMgrCompletedContainersEvent.Reason.ON_SHUTDOWN)); - - LOG.info("Waiting for containers to be killed"); - - switch (eventType) { - case SHUTDOWN: - long waitStartTime = System.currentTimeMillis(); - while (!containers.isEmpty() - && System.currentTimeMillis() - waitStartTime < waitForContainersOnShutdownMillis) { - try { - //To remove done containers in NM context - nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext(); - Thread.sleep(1000); - } catch (InterruptedException ex) { - LOG.warn("Interrupted while sleeping on container kill on shutdown", - ex); - } - } - break; - case RESYNC: - while (!containers.isEmpty()) { - try { - Thread.sleep(1000); - nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext(); - } catch (InterruptedException ex) { - LOG.warn("Interrupted while sleeping on container kill on resync", - ex); - } - } - break; - default: - LOG.warn("Invalid eventType: " + eventType); - } - - // All containers killed - if (containers.isEmpty()) { - LOG.info("All containers in DONE state"); - } else { - LOG.info("Done waiting for containers to be killed. Still alive: " + - containers.keySet()); - } - } - public static class NMContext implements Context { private NodeId nodeId = null; @@ -470,9 +397,16 @@ public class NodeManager extends Composi StringUtils.startupShutdownMessage(NodeManager.class, args, LOG); NodeManager nodeManager = new NodeManager(); Configuration conf = new YarnConfiguration(); + setHttpPolicy(conf); nodeManager.initAndStartNodeManager(conf, false); } + private static void setHttpPolicy(Configuration conf) { + HttpConfig.setPolicy(Policy.fromString(conf.get( + YarnConfiguration.YARN_HTTP_POLICY_KEY, + YarnConfiguration.YARN_HTTP_POLICY_DEFAULT))); + } + @VisibleForTesting @Private public NodeStatusUpdater getNodeStatusUpdater() { Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java Thu Oct 17 05:32:42 2013 @@ -37,6 +37,7 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.util.VersionUtil; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; @@ -63,6 +64,7 @@ import org.apache.hadoop.yarn.server.api import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; +import org.apache.hadoop.yarn.util.YarnVersionInfo; import com.google.common.annotations.VisibleForTesting; @@ -84,6 +86,8 @@ public class NodeStatusUpdaterImpl exten private ResourceTracker resourceTracker; private Resource totalResource; private int httpPort; + private String nodeManagerVersionId; + private String minimumResourceManagerVersion; private volatile boolean isStopped; private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); private boolean tokenKeepAliveEnabled; @@ -138,6 +142,10 @@ public class NodeStatusUpdaterImpl exten this.tokenRemovalDelayMs = conf.getInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, YarnConfiguration.DEFAULT_RM_NM_EXPIRY_INTERVAL_MS); + + this.minimumResourceManagerVersion = conf.get( + YarnConfiguration.NM_RESOURCEMANAGER_MINIMUM_VERSION, + YarnConfiguration.DEFAULT_NM_RESOURCEMANAGER_MINIMUM_VERSION); // Default duration to track stopped containers on nodemanager is 10Min. // This should not be assigned very large value as it will remember all the @@ -168,6 +176,7 @@ public class NodeStatusUpdaterImpl exten // NodeManager is the last service to start, so NodeId is available. this.nodeId = this.context.getNodeId(); this.httpPort = this.context.getHttpPort(); + this.nodeManagerVersionId = YarnVersionInfo.getVersion(); try { // Registration has to be in start so that ContainerManager can get the // perNM tokens needed to authenticate ContainerTokens. @@ -235,6 +244,7 @@ public class NodeStatusUpdaterImpl exten request.setHttpPort(this.httpPort); request.setResource(this.totalResource); request.setNodeId(this.nodeId); + request.setNMVersion(this.nodeManagerVersionId); RegisterNodeManagerResponse regNMResponse = resourceTracker.registerNodeManager(request); this.rmIdentifier = regNMResponse.getRMIdentifier(); @@ -248,6 +258,26 @@ public class NodeStatusUpdaterImpl exten + message); } + // if ResourceManager version is too old then shutdown + if (!minimumResourceManagerVersion.equals("NONE")){ + if (minimumResourceManagerVersion.equals("EqualToNM")){ + minimumResourceManagerVersion = nodeManagerVersionId; + } + String rmVersion = regNMResponse.getRMVersion(); + if (rmVersion == null) { + String message = "The Resource Manager's did not return a version. " + + "Valid version cannot be checked."; + throw new YarnRuntimeException("Shutting down the Node Manager. " + + message); + } + if (VersionUtil.compareVersions(rmVersion,minimumResourceManagerVersion) < 0) { + String message = "The Resource Manager's version (" + + rmVersion +") is less than the minimum " + + "allowed version " + minimumResourceManagerVersion; + throw new YarnRuntimeException("Shutting down the Node Manager on RM " + + "version error, " + message); + } + } MasterKey masterKey = regNMResponse.getContainerTokenMasterKey(); // do this now so that its set before we start heartbeating to RM // It is expected that status updater is started by this point and @@ -469,18 +499,19 @@ public class NodeStatusUpdaterImpl exten lastHeartBeatID = response.getResponseId(); List containersToCleanup = response .getContainersToCleanup(); - if (containersToCleanup.size() != 0) { + if (!containersToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( - new CMgrCompletedContainersEvent(containersToCleanup, - CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER)); + new CMgrCompletedContainersEvent(containersToCleanup, + CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER)); } List appsToCleanup = response.getApplicationsToCleanup(); //Only start tracking for keepAlive on FINISH_APP trackAppsForKeepAlive(appsToCleanup); - if (appsToCleanup.size() != 0) { + if (!appsToCleanup.isEmpty()) { dispatcher.getEventHandler().handle( - new CMgrCompletedAppsEvent(appsToCleanup)); + new CMgrCompletedAppsEvent(appsToCleanup, + CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); } } catch (ConnectException e) { //catch and throw the exception if tried MAX wait time to connect RM Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java Thu Oct 17 05:32:42 2013 @@ -24,6 +24,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,6 +41,8 @@ import org.apache.hadoop.yarn.server.api import org.apache.hadoop.yarn.server.api.ContainerInitializationContext; import org.apache.hadoop.yarn.server.api.ContainerTerminationContext; +import com.google.common.base.Preconditions; + public class AuxServices extends AbstractService implements ServiceStateChangeListener, EventHandler { @@ -48,6 +51,8 @@ public class AuxServices extends Abstrac protected final Map serviceMap; protected final Map serviceMetaData; + private final Pattern p = Pattern.compile("^[A-Za-z_]+[A-Za-z0-9_]*$"); + public AuxServices() { super(AuxServices.class.getName()); serviceMap = @@ -90,6 +95,13 @@ public class AuxServices extends Abstrac YarnConfiguration.NM_AUX_SERVICES); for (final String sName : auxNames) { try { + Preconditions + .checkArgument( + validateAuxServiceName(sName), + "The ServiceName: " + sName + " set in " + + YarnConfiguration.NM_AUX_SERVICES +" is invalid." + + "The valid service name should only contain a-zA-Z0-9_ " + + "and can not start with numbers"); Class sClass = conf.getClass( String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, sName), null, AuxiliaryService.class); @@ -163,40 +175,70 @@ public class AuxServices extends Abstrac LOG.info("Got event " + event.getType() + " for appId " + event.getApplicationID()); switch (event.getType()) { - case APPLICATION_INIT: - LOG.info("Got APPLICATION_INIT for service " + event.getServiceID()); - AuxiliaryService service = serviceMap.get(event.getServiceID()); - if (null == service) { - LOG.info("service is null"); - // TODO kill all containers waiting on Application - return; - } - service.initializeApplication(new ApplicationInitializationContext(event - .getUser(), event.getApplicationID(), event.getServiceData())); - break; - case APPLICATION_STOP: - for (AuxiliaryService serv : serviceMap.values()) { - serv.stopApplication(new ApplicationTerminationContext(event - .getApplicationID())); - } - break; - case CONTAINER_INIT: - for (AuxiliaryService serv : serviceMap.values()) { - serv.initializeContainer(new ContainerInitializationContext( - event.getUser(), event.getContainer().getContainerId(), - event.getContainer().getResource())); - } - break; - case CONTAINER_STOP: - for (AuxiliaryService serv : serviceMap.values()) { - serv.stopContainer(new ContainerTerminationContext( - event.getUser(), event.getContainer().getContainerId(), - event.getContainer().getResource())); - } - break; + case APPLICATION_INIT: + LOG.info("Got APPLICATION_INIT for service " + event.getServiceID()); + AuxiliaryService service = null; + try { + service = serviceMap.get(event.getServiceID()); + service + .initializeApplication(new ApplicationInitializationContext(event + .getUser(), event.getApplicationID(), event.getServiceData())); + } catch (Throwable th) { + logWarningWhenAuxServiceThrowExceptions(service, + AuxServicesEventType.APPLICATION_INIT, th); + } + break; + case APPLICATION_STOP: + for (AuxiliaryService serv : serviceMap.values()) { + try { + serv.stopApplication(new ApplicationTerminationContext(event + .getApplicationID())); + } catch (Throwable th) { + logWarningWhenAuxServiceThrowExceptions(serv, + AuxServicesEventType.APPLICATION_STOP, th); + } + } + break; + case CONTAINER_INIT: + for (AuxiliaryService serv : serviceMap.values()) { + try { + serv.initializeContainer(new ContainerInitializationContext( + event.getUser(), event.getContainer().getContainerId(), + event.getContainer().getResource())); + } catch (Throwable th) { + logWarningWhenAuxServiceThrowExceptions(serv, + AuxServicesEventType.CONTAINER_INIT, th); + } + } + break; + case CONTAINER_STOP: + for (AuxiliaryService serv : serviceMap.values()) { + try { + serv.stopContainer(new ContainerTerminationContext( + event.getUser(), event.getContainer().getContainerId(), + event.getContainer().getResource())); + } catch (Throwable th) { + logWarningWhenAuxServiceThrowExceptions(serv, + AuxServicesEventType.CONTAINER_STOP, th); + } + } + break; default: - throw new RuntimeException("Unknown type: " + event.getType()); + throw new RuntimeException("Unknown type: " + event.getType()); } } + private boolean validateAuxServiceName(String name) { + if (name == null || name.trim().isEmpty()) { + return false; + } + return p.matcher(name).matches(); + } + + private void logWarningWhenAuxServiceThrowExceptions(AuxiliaryService service, + AuxServicesEventType eventType, Throwable th) { + LOG.warn((null == service ? "The auxService is null" + : "The auxService name is " + service.getName()) + + " and it got an error at event: " + eventType, th); + } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java Thu Oct 17 05:32:42 2013 @@ -30,6 +30,9 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; +import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -67,9 +70,11 @@ import org.apache.hadoop.yarn.api.record import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.EventHandler; +import org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException; import org.apache.hadoop.yarn.exceptions.InvalidContainerException; import org.apache.hadoop.yarn.exceptions.NMNotYetReadyException; import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; @@ -119,6 +124,11 @@ public class ContainerManagerImpl extend ServiceStateChangeListener, ContainerManagementProtocol, EventHandler { + /** + * Extra duration to wait for applications to be killed on shutdown. + */ + private static final int SHUTDOWN_CLEANUP_SLOP_MS = 1000; + private static final Log LOG = LogFactory.getLog(ContainerManagerImpl.class); final Context context; @@ -137,6 +147,11 @@ public class ContainerManagerImpl extend private final DeletionService deletionService; private AtomicBoolean blockNewContainerRequests = new AtomicBoolean(false); + private boolean serviceStopped = false; + private final ReadLock readLock; + private final WriteLock writeLock; + + private long waitForContainersOnShutdownMillis; public ContainerManagerImpl(Context context, ContainerExecutor exec, DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, @@ -180,6 +195,10 @@ public class ContainerManagerImpl extend dispatcher.register(ContainersLauncherEventType.class, containersLauncher); addService(dispatcher); + + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + this.readLock = lock.readLock(); + this.writeLock = lock.writeLock(); } @Override @@ -189,6 +208,13 @@ public class ContainerManagerImpl extend addIfService(logHandler); dispatcher.register(LogHandlerEventType.class, logHandler); + waitForContainersOnShutdownMillis = + conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, + YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS) + + conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS, + YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS) + + SHUTDOWN_CLEANUP_SLOP_MS; + super.serviceInit(conf); } @@ -274,6 +300,16 @@ public class ContainerManagerImpl extend @Override public void serviceStop() throws Exception { + setBlockNewContainerRequests(true); + this.writeLock.lock(); + try { + serviceStopped = true; + if (context != null) { + cleanUpApplicationsOnNMShutDown(); + } + } finally { + this.writeLock.unlock(); + } if (auxiliaryServices.getServiceState() == STARTED) { auxiliaryServices.unregisterServiceListener(this); } @@ -283,6 +319,76 @@ public class ContainerManagerImpl extend super.serviceStop(); } + public void cleanUpApplicationsOnNMShutDown() { + Map applications = + this.context.getApplications(); + if (applications.isEmpty()) { + return; + } + LOG.info("Applications still running : " + applications.keySet()); + + List appIds = + new ArrayList(applications.keySet()); + this.handle( + new CMgrCompletedAppsEvent(appIds, + CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN)); + + LOG.info("Waiting for Applications to be Finished"); + + long waitStartTime = System.currentTimeMillis(); + while (!applications.isEmpty() + && System.currentTimeMillis() - waitStartTime < waitForContainersOnShutdownMillis) { + try { + Thread.sleep(1000); + } catch (InterruptedException ex) { + LOG.warn( + "Interrupted while sleeping on applications finish on shutdown", ex); + } + } + + // All applications Finished + if (applications.isEmpty()) { + LOG.info("All applications in FINISHED state"); + } else { + LOG.info("Done waiting for Applications to be Finished. Still alive: " + + applications.keySet()); + } + } + + public void cleanupContainersOnNMResync() { + Map containers = context.getContainers(); + if (containers.isEmpty()) { + return; + } + LOG.info("Containers still running on " + + CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC + " : " + + containers.keySet()); + + List containerIds = + new ArrayList(containers.keySet()); + + LOG.info("Waiting for containers to be killed"); + + this.handle(new CMgrCompletedContainersEvent(containerIds, + CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC)); + while (!containers.isEmpty()) { + try { + Thread.sleep(1000); + nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext(); + } catch (InterruptedException ex) { + LOG.warn("Interrupted while sleeping on container kill on resync", ex); + } + } + + // All containers killed + if (containers.isEmpty()) { + LOG.info("All containers in DONE state"); + } else { + LOG.info("Done waiting for containers to be killed. Still alive: " + + containers.keySet()); + } + } + // Get the remoteUGI corresponding to the api call. protected UserGroupInformation getRemoteUgi() throws YarnException { @@ -451,6 +557,18 @@ public class ContainerManagerImpl extend ContainerLaunchContext launchContext = request.getContainerLaunchContext(); + Map serviceData = getAuxServiceMetaData(); + if (launchContext.getServiceData()!=null && + !launchContext.getServiceData().isEmpty()) { + for (Map.Entry meta : launchContext.getServiceData() + .entrySet()) { + if (null == serviceData.get(meta.getKey())) { + throw new InvalidAuxServiceException("The auxService:" + meta.getKey() + + " does not exist"); + } + } + } + Credentials credentials = parseCredentials(launchContext); Container container = @@ -466,29 +584,40 @@ public class ContainerManagerImpl extend + " already is running on this node!!"); } - // Create the application - Application application = - new ApplicationImpl(dispatcher, user, applicationID, credentials, context); - if (null == context.getApplications().putIfAbsent(applicationID, - application)) { - LOG.info("Creating a new application reference for app " + applicationID); - - dispatcher.getEventHandler().handle( - new ApplicationInitEvent(applicationID, container.getLaunchContext() - .getApplicationACLs())); - } + this.readLock.lock(); + try { + if (!serviceStopped) { + // Create the application + Application application = + new ApplicationImpl(dispatcher, user, applicationID, credentials, context); + if (null == context.getApplications().putIfAbsent(applicationID, + application)) { + LOG.info("Creating a new application reference for app " + applicationID); + + dispatcher.getEventHandler().handle( + new ApplicationInitEvent(applicationID, container.getLaunchContext() + .getApplicationACLs())); + } - dispatcher.getEventHandler().handle( - new ApplicationContainerInitEvent(container)); + dispatcher.getEventHandler().handle( + new ApplicationContainerInitEvent(container)); - this.context.getContainerTokenSecretManager().startContainerSuccessful( - containerTokenIdentifier); - NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER, - "ContainerManageImpl", applicationID, containerId); - // TODO launchedContainer misplaced -> doesn't necessarily mean a container - // launch. A finished Application will not launch containers. - metrics.launchedContainer(); - metrics.allocateContainer(containerTokenIdentifier.getResource()); + this.context.getContainerTokenSecretManager().startContainerSuccessful( + containerTokenIdentifier); + NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER, + "ContainerManageImpl", applicationID, containerId); + // TODO launchedContainer misplaced -> doesn't necessarily mean a container + // launch. A finished Application will not launch containers. + metrics.launchedContainer(); + metrics.allocateContainer(containerTokenIdentifier.getResource()); + } else { + throw new YarnException( + "Container start failed as the NodeManager is " + + "in the process of shutting down"); + } + } finally { + this.readLock.unlock(); + } } protected ContainerTokenIdentifier verifyAndGetContainerTokenIdentifier( @@ -713,9 +842,15 @@ public class ContainerManagerImpl extend CMgrCompletedAppsEvent appsFinishedEvent = (CMgrCompletedAppsEvent) event; for (ApplicationId appID : appsFinishedEvent.getAppsToCleanup()) { + String diagnostic = ""; + if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) { + diagnostic = "Application killed on shutdown"; + } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) { + diagnostic = "Application killed by ResourceManager"; + } this.dispatcher.getEventHandler().handle( new ApplicationFinishEvent(appID, - "Application Killed by ResourceManager")); + diagnostic)); } break; case FINISH_CONTAINERS: @@ -723,20 +858,14 @@ public class ContainerManagerImpl extend (CMgrCompletedContainersEvent) event; for (ContainerId container : containersFinishedEvent .getContainersToCleanup()) { - String diagnostic = ""; - if (containersFinishedEvent.getReason() == - CMgrCompletedContainersEvent.Reason.ON_SHUTDOWN) { - diagnostic = "Container Killed on Shutdown"; - } else if (containersFinishedEvent.getReason() == - CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER) { - diagnostic = "Container Killed by ResourceManager"; - } - this.dispatcher.getEventHandler().handle( - new ContainerKillEvent(container, diagnostic)); + this.dispatcher.getEventHandler().handle( + new ContainerKillEvent(container, + "Container Killed by ResourceManager")); } break; default: - LOG.warn("Invalid event " + event.getType() + ". Ignoring."); + throw new YarnRuntimeException( + "Got an unknown ContainerManagerEvent type: " + event.getType()); } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java Thu Oct 17 05:32:42 2013 @@ -177,6 +177,13 @@ public class ApplicationImpl implements ApplicationState.APPLICATION_RESOURCES_CLEANINGUP), ApplicationEventType.APPLICATION_CONTAINER_FINISHED, new AppFinishTransition()) + .addTransition(ApplicationState.FINISHING_CONTAINERS_WAIT, + ApplicationState.FINISHING_CONTAINERS_WAIT, + EnumSet.of( + ApplicationEventType.APPLICATION_LOG_HANDLING_INITED, + ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED, + ApplicationEventType.APPLICATION_INITED, + ApplicationEventType.FINISH_APPLICATION)) // Transitions from APPLICATION_RESOURCES_CLEANINGUP state .addTransition(ApplicationState.APPLICATION_RESOURCES_CLEANINGUP, @@ -186,12 +193,25 @@ public class ApplicationImpl implements ApplicationState.FINISHED, ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP, new AppCompletelyDoneTransition()) + .addTransition(ApplicationState.APPLICATION_RESOURCES_CLEANINGUP, + ApplicationState.APPLICATION_RESOURCES_CLEANINGUP, + EnumSet.of( + ApplicationEventType.APPLICATION_LOG_HANDLING_INITED, + ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED, + ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED, + ApplicationEventType.APPLICATION_INITED, + ApplicationEventType.FINISH_APPLICATION)) // Transitions from FINISHED state .addTransition(ApplicationState.FINISHED, ApplicationState.FINISHED, ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED, new AppLogsAggregatedTransition()) + .addTransition(ApplicationState.FINISHED, ApplicationState.FINISHED, + EnumSet.of( + ApplicationEventType.APPLICATION_LOG_HANDLING_INITED, + ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED, + ApplicationEventType.FINISH_APPLICATION)) // create the topology tables .installTopology(); @@ -343,7 +363,7 @@ public class ApplicationImpl implements @Override public ApplicationState transition(ApplicationImpl app, ApplicationEvent event) { - + ApplicationFinishEvent appEvent = (ApplicationFinishEvent)event; if (app.containers.isEmpty()) { // No container to cleanup. Cleanup app level resources. app.handleAppFinishWithContainersCleanedup(); @@ -355,7 +375,7 @@ public class ApplicationImpl implements for (ContainerId containerID : app.containers.keySet()) { app.dispatcher.getEventHandler().handle( new ContainerKillEvent(containerID, - "Container killed on application-finish event from RM.")); + "Container killed on application-finish event: " + appEvent.getDiagnostic())); } return ApplicationState.FINISHING_CONTAINERS_WAIT; } @@ -395,6 +415,7 @@ public class ApplicationImpl implements app.dispatcher.getEventHandler().handle( new LogHandlerAppFinishedEvent(app.appId)); + app.context.getNMTokenSecretManager().appFinished(app.getAppId()); } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java Thu Oct 17 05:32:42 2013 @@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.server.nod import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader; @@ -133,10 +134,22 @@ public class ContainerLaunch implements final List command = launchContext.getCommands(); int ret = -1; + // CONTAINER_KILLED_ON_REQUEST should not be missed if the container + // is already at KILLING + if (container.getContainerState() == ContainerState.KILLING) { + dispatcher.getEventHandler().handle( + new ContainerExitEvent(containerID, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, + Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : + ExitCode.TERMINATED.getExitCode(), + "Container terminated before launch.")); + return 0; + } + try { localResources = container.getLocalizedResources(); if (localResources == null) { - RPCUtil.getRemoteException( + throw RPCUtil.getRemoteException( "Unable to get local resources when Container " + containerID + " is at " + container.getContainerState()); } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java Thu Oct 17 05:32:42 2013 @@ -75,20 +75,9 @@ public class ContainersLauncher extends new ThreadFactoryBuilder() .setNameFormat("ContainersLauncher #%d") .build()); - private final Map running = - Collections.synchronizedMap(new HashMap()); - - private static final class RunningContainer { - public RunningContainer(Future submit, - ContainerLaunch launcher) { - this.runningcontainer = submit; - this.launcher = launcher; - } - - Future runningcontainer; - ContainerLaunch launcher; - } - + @VisibleForTesting + public final Map running = + Collections.synchronizedMap(new HashMap()); public ContainersLauncher(Context context, Dispatcher dispatcher, ContainerExecutor exec, LocalDirsHandlerService dirsHandler, @@ -133,38 +122,20 @@ public class ContainersLauncher extends ContainerLaunch launch = new ContainerLaunch(context, getConfig(), dispatcher, exec, app, event.getContainer(), dirsHandler, containerManager); - running.put(containerId, - new RunningContainer(containerLauncher.submit(launch), - launch)); + containerLauncher.submit(launch); + running.put(containerId, launch); break; case CLEANUP_CONTAINER: - RunningContainer rContainerDatum = running.remove(containerId); - if (rContainerDatum == null) { + ContainerLaunch launcher = running.remove(containerId); + if (launcher == null) { // Container not launched. So nothing needs to be done. return; } - Future rContainer = rContainerDatum.runningcontainer; - if (rContainer != null - && !rContainer.isDone()) { - // Cancel the future so that it won't be launched if it isn't already. - // If it is going to be canceled, make sure CONTAINER_KILLED_ON_REQUEST - // will not be missed if the container is already at KILLING - if (rContainer.cancel(false)) { - if (container.getContainerState() == ContainerState.KILLING) { - dispatcher.getEventHandler().handle( - new ContainerExitEvent(containerId, - ContainerEventType.CONTAINER_KILLED_ON_REQUEST, - Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : - ExitCode.TERMINATED.getExitCode(), - "Container terminated before launch.")); - } - } - } // Cleanup a container whether it is running/killed/completed, so that // no sub-processes are alive. try { - rContainerDatum.launcher.cleanupContainer(); + launcher.cleanupContainer(); } catch (IOException e) { LOG.warn("Got exception while cleaning container " + containerId + ". Ignoring."); Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ContainerLocalizer.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ContainerLocalizer.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ContainerLocalizer.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ContainerLocalizer.java Thu Oct 17 05:32:42 2013 @@ -65,7 +65,6 @@ import org.apache.hadoop.yarn.server.nod import org.apache.hadoop.yarn.server.nodemanager.api.protocolrecords.LocalizerStatus; import org.apache.hadoop.yarn.server.nodemanager.api.protocolrecords.ResourceStatusType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.security.LocalizerTokenIdentifier; -import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.FSDownload; @@ -130,9 +129,12 @@ public class ContainerLocalizer { try { // assume credentials in cwd // TODO: Fix - credFile = lfs.open( - new Path(String.format(TOKEN_FILE_NAME_FMT, localizerId))); + Path tokenPath = + new Path(String.format(TOKEN_FILE_NAME_FMT, localizerId)); + credFile = lfs.open(tokenPath); creds.readTokenStorageStream(credFile); + // Explicitly deleting token file. + lfs.delete(tokenPath, false); } finally { if (credFile != null) { credFile.close(); Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java Thu Oct 17 05:32:42 2013 @@ -1017,6 +1017,7 @@ public class ResourceLocalizationService } } if (UserGroupInformation.isSecurityEnabled()) { + credentials = new Credentials(credentials); LocalizerTokenIdentifier id = secretManager.createIdentifier(); Token localizerToken = new Token(id, secretManager); Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java Thu Oct 17 05:32:42 2013 @@ -32,6 +32,7 @@ import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -40,6 +41,8 @@ import org.apache.hadoop.yarn.api.record import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; +import org.apache.hadoop.yarn.util.Clock; +import org.apache.hadoop.yarn.util.SystemClock; public class CgroupsLCEResourcesHandler implements LCEResourcesHandler { @@ -59,8 +62,13 @@ public class CgroupsLCEResourcesHandler private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel private final Map controllerPaths; // Controller -> path + private long deleteCgroupTimeout; + // package private for testing purposes + Clock clock; + public CgroupsLCEResourcesHandler() { this.controllerPaths = new HashMap(); + clock = new SystemClock(); } @Override @@ -73,7 +81,8 @@ public class CgroupsLCEResourcesHandler return conf; } - public synchronized void init(LinuxContainerExecutor lce) throws IOException { + @VisibleForTesting + void initConfig() throws IOException { this.cgroupPrefix = conf.get(YarnConfiguration. NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn"); @@ -82,6 +91,9 @@ public class CgroupsLCEResourcesHandler this.cgroupMountPath = conf.get(YarnConfiguration. NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null); + this.deleteCgroupTimeout = conf.getLong( + YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT, + YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT); // remove extra /'s at end or start of cgroupPrefix if (cgroupPrefix.charAt(0) == '/') { cgroupPrefix = cgroupPrefix.substring(1); @@ -91,7 +103,11 @@ public class CgroupsLCEResourcesHandler if (cgroupPrefix.charAt(len - 1) == '/') { cgroupPrefix = cgroupPrefix.substring(0, len - 1); } + } + public void init(LinuxContainerExecutor lce) throws IOException { + initConfig(); + // mount cgroups if requested if (cgroupMount && cgroupMountPath != null) { ArrayList cgroupKVs = new ArrayList(); @@ -158,14 +174,32 @@ public class CgroupsLCEResourcesHandler } } - private void deleteCgroup(String controller, String groupName) { - String path = pathForCgroup(controller, groupName); + @VisibleForTesting + boolean deleteCgroup(String cgroupPath) { + boolean deleted; + + if (LOG.isDebugEnabled()) { + LOG.debug("deleteCgroup: " + cgroupPath); + } - LOG.debug("deleteCgroup: " + path); + long start = clock.getTime(); + do { + deleted = new File(cgroupPath).delete(); + if (!deleted) { + try { + Thread.sleep(20); + } catch (InterruptedException ex) { + // NOP + } + } + } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout); - if (! new File(path).delete()) { - LOG.warn("Unable to delete cgroup at: " + path); + if (!deleted) { + LOG.warn("Unable to delete cgroup at: " + cgroupPath + + ", tried to delete for " + deleteCgroupTimeout + "ms"); } + + return deleted; } /* @@ -185,21 +219,8 @@ public class CgroupsLCEResourcesHandler } private void clearLimits(ContainerId containerId) { - String containerName = containerId.toString(); - - // Based on testing, ApplicationMaster executables don't terminate until - // a little after the container appears to have finished. Therefore, we - // wait a short bit for the cgroup to become empty before deleting it. - if (containerId.getId() == 1) { - try { - Thread.sleep(500); - } catch (InterruptedException e) { - // not a problem, continue anyway - } - } - if (isCpuWeightEnabled()) { - deleteCgroup(CONTROLLER_CPU, containerName); + deleteCgroup(pathForCgroup(CONTROLLER_CPU, containerId.toString())); } } Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java Thu Oct 17 05:32:42 2013 @@ -19,8 +19,9 @@ package org.apache.hadoop.yarn.server.nodemanager.webapp; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.http.HttpConfig; import org.apache.hadoop.yarn.webapp.YarnWebParams; +import org.apache.hadoop.yarn.webapp.util.WebAppUtils; import org.apache.hadoop.yarn.webapp.view.HtmlBlock; import com.google.inject.Inject; @@ -37,8 +38,9 @@ public class NavBlock extends HtmlBlock @Override protected void render(Block html) { - String RMWebAppURL = YarnConfiguration.getRMWebAppURL(this.conf); - html + String RMWebAppURL = + WebAppUtils.getResolvedRMWebAppURLWithScheme(this.conf); + html .div("#nav") .h3()._("ResourceManager")._() .ul() Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/WebServer.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/WebServer.java?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/WebServer.java (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/WebServer.java Thu Oct 17 05:32:42 2013 @@ -22,7 +22,7 @@ import static org.apache.hadoop.yarn.uti import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.http.HttpConfig; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.webapp.Gen import org.apache.hadoop.yarn.webapp.WebApp; import org.apache.hadoop.yarn.webapp.WebApps; import org.apache.hadoop.yarn.webapp.YarnWebParams; +import org.apache.hadoop.yarn.webapp.util.WebAppUtils; public class WebServer extends AbstractService { @@ -54,8 +55,8 @@ public class WebServer extends AbstractS @Override protected void serviceStart() throws Exception { - String bindAddress = getConfig().get(YarnConfiguration.NM_WEBAPP_ADDRESS, - YarnConfiguration.DEFAULT_NM_WEBAPP_ADDRESS); + String bindAddress = WebAppUtils.getNMWebAppURLWithoutScheme(getConfig()); + LOG.info("Instantiating NMWebApp at " + bindAddress); try { this.webApp = Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c Thu Oct 17 05:32:42 2013 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -406,7 +407,7 @@ static int create_container_directories( const char *container_id, char* const* local_dir, char* const* log_dir, const char *work_dir) { // create dirs as 0750 const mode_t perms = S_IRWXU | S_IRGRP | S_IXGRP; - if (app_id == NULL || container_id == NULL || user == NULL) { + if (app_id == NULL || container_id == NULL || user == NULL || user_detail == NULL || user_detail->pw_name == NULL) { fprintf(LOGFILE, "Either app_id, container_id or the user passed is null.\n"); return -1; @@ -492,6 +493,21 @@ static struct passwd* get_user_info(cons return result; } +int is_whitelisted(const char *user) { + char **whitelist = get_values(ALLOWED_SYSTEM_USERS_KEY); + char **users = whitelist; + if (whitelist != NULL) { + for(; *users; ++users) { + if (strncmp(*users, user, LOGIN_NAME_MAX) == 0) { + free_values(whitelist); + return 1; + } + } + free_values(whitelist); + } + return 0; +} + /** * Is the user a real user account? * Checks: @@ -526,9 +542,9 @@ struct passwd* check_user(const char *us fflush(LOGFILE); return NULL; } - if (user_info->pw_uid < min_uid) { - fprintf(LOGFILE, "Requested user %s has id %d, which is below the " - "minimum allowed %d\n", user, user_info->pw_uid, min_uid); + if (user_info->pw_uid < min_uid && !is_whitelisted(user)) { + fprintf(LOGFILE, "Requested user %s is not whitelisted and has id %d," + "which is below the minimum allowed %d\n", user, user_info->pw_uid, min_uid); fflush(LOGFILE); free(user_info); return NULL; @@ -735,28 +751,11 @@ int initialize_user(const char *user, ch return failed ? INITIALIZE_USER_FAILED : 0; } -/** - * Function to prepare the application directories for the container. - */ -int initialize_app(const char *user, const char *app_id, - const char* nmPrivate_credentials_file, - char* const* local_dirs, char* const* log_roots, - char* const* args) { - if (app_id == NULL || user == NULL) { - fprintf(LOGFILE, "Either app_id is null or the user passed is null.\n"); - return INVALID_ARGUMENT_NUMBER; - } +int create_log_dirs(const char *app_id, char * const * log_dirs) { - // create the user directory on all disks - int result = initialize_user(user, local_dirs); - if (result != 0) { - return result; - } - - ////////////// create the log directories for the app on all disks char* const* log_root; char *any_one_app_log_dir = NULL; - for(log_root=log_roots; *log_root != NULL; ++log_root) { + for(log_root=log_dirs; *log_root != NULL; ++log_root) { char *app_log_dir = get_app_log_directory(*log_root, app_id); if (app_log_dir == NULL) { // try the next one @@ -775,7 +774,33 @@ int initialize_app(const char *user, con return -1; } free(any_one_app_log_dir); - ////////////// End of creating the log directories for the app on all disks + return 0; +} + + +/** + * Function to prepare the application directories for the container. + */ +int initialize_app(const char *user, const char *app_id, + const char* nmPrivate_credentials_file, + char* const* local_dirs, char* const* log_roots, + char* const* args) { + if (app_id == NULL || user == NULL || user_detail == NULL || user_detail->pw_name == NULL) { + fprintf(LOGFILE, "Either app_id is null or the user passed is null.\n"); + return INVALID_ARGUMENT_NUMBER; + } + + // create the user directory on all disks + int result = initialize_user(user, local_dirs); + if (result != 0) { + return result; + } + + // create the log directories for the app on all disks + int log_create_result = create_log_dirs(app_id, log_roots); + if (log_create_result != 0) { + return log_create_result; + } // open up the credentials file int cred_file = open_file_as_nm(nmPrivate_credentials_file); @@ -906,18 +931,34 @@ int launch_container_as_user(const char } } + // create the user directory on all disks + int result = initialize_user(user, local_dirs); + if (result != 0) { + return result; + } + + // initializing log dirs + int log_create_result = create_log_dirs(app_id, log_dirs); + if (log_create_result != 0) { + return log_create_result; + } + // give up root privs if (change_user(user_detail->pw_uid, user_detail->pw_gid) != 0) { exit_code = SETUID_OPER_FAILED; goto cleanup; } + // Create container specific directories as user. If there are no resources + // to localize for this container, app-directories and log-directories are + // also created automatically as part of this call. if (create_container_directories(user, app_id, container_id, local_dirs, log_dirs, work_dir) != 0) { fprintf(LOGFILE, "Could not create container dirs"); goto cleanup; } + // 700 if (copy_file(container_file_source, script_name, script_file_dest,S_IRWXU) != 0) { goto cleanup; Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h Thu Oct 17 05:32:42 2013 @@ -65,6 +65,7 @@ enum errorcodes { #define CREDENTIALS_FILENAME "container_tokens" #define MIN_USERID_KEY "min.user.id" #define BANNED_USERS_KEY "banned.users" +#define ALLOWED_SYSTEM_USERS_KEY "allowed.system.users" #define TMP_DIR "tmp" extern struct passwd *user_detail; Modified: hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c?rev=1532967&r1=1532966&r2=1532967&view=diff ============================================================================== --- hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c (original) +++ hadoop/common/branches/HDFS-4949/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/main.c Thu Oct 17 05:32:42 2013 @@ -49,7 +49,7 @@ void display_usage(FILE *stream) { "Usage: container-executor --mount-cgroups "\ "hierarchy controller=path...\n"); fprintf(stream, - "Usage: container-executor user command command-args\n"); + "Usage: container-executor user yarn-user command command-args\n"); fprintf(stream, "Commands:\n"); fprintf(stream, " initialize container: %2d appid tokens " \ "nm-local-dirs nm-log-dirs cmd app...\n", INITIALIZE_CONTAINER); @@ -178,18 +178,29 @@ int main(int argc, char **argv) { if (ret != 0) { return ret; } + + // this string is used for building pathnames, the + // process management is done based on the 'user_detail' + // global, which was set by 'set_user()' above + optind = optind + 1; + char *yarn_user_name = argv[optind]; + if (yarn_user_name == NULL) { + fprintf(ERRORFILE, "Invalid yarn user name.\n"); + return INVALID_USER_NAME; + } optind = optind + 1; command = atoi(argv[optind++]); fprintf(LOGFILE, "main : command provided %d\n",command); fprintf(LOGFILE, "main : user is %s\n", user_detail->pw_name); + fprintf(LOGFILE, "main : requested yarn user is %s\n", yarn_user_name); fflush(LOGFILE); switch (command) { case INITIALIZE_CONTAINER: - if (argc < 8) { - fprintf(ERRORFILE, "Too few arguments (%d vs 8) for initialize container\n", + if (argc < 9) { + fprintf(ERRORFILE, "Too few arguments (%d vs 9) for initialize container\n", argc); fflush(ERRORFILE); return INVALID_ARGUMENT_NUMBER; @@ -198,13 +209,13 @@ int main(int argc, char **argv) { cred_file = argv[optind++]; local_dirs = argv[optind++];// good local dirs as a comma separated list log_dirs = argv[optind++];// good log dirs as a comma separated list - exit_code = initialize_app(user_detail->pw_name, app_id, cred_file, + exit_code = initialize_app(yarn_user_name, app_id, cred_file, extract_values(local_dirs), extract_values(log_dirs), argv + optind); break; case LAUNCH_CONTAINER: - if (argc != 12) { - fprintf(ERRORFILE, "Wrong number of arguments (%d vs 12) for launch container\n", + if (argc != 13) { + fprintf(ERRORFILE, "Wrong number of arguments (%d vs 13) for launch container\n", argc); fflush(ERRORFILE); return INVALID_ARGUMENT_NUMBER; @@ -230,7 +241,7 @@ int main(int argc, char **argv) { return INVALID_ARGUMENT_NUMBER; } char** resources_values = extract_values(resources_value); - exit_code = launch_container_as_user(user_detail->pw_name, app_id, + exit_code = launch_container_as_user(yarn_user_name, app_id, container_id, current_dir, script_file, cred_file, pid_file, extract_values(local_dirs), extract_values(log_dirs), resources_key, @@ -239,8 +250,8 @@ int main(int argc, char **argv) { free(resources_value); break; case SIGNAL_CONTAINER: - if (argc != 5) { - fprintf(ERRORFILE, "Wrong number of arguments (%d vs 5) for " \ + if (argc != 6) { + fprintf(ERRORFILE, "Wrong number of arguments (%d vs 6) for " \ "signal container\n", argc); fflush(ERRORFILE); return INVALID_ARGUMENT_NUMBER; @@ -260,12 +271,12 @@ int main(int argc, char **argv) { fflush(ERRORFILE); return INVALID_ARGUMENT_NUMBER; } - exit_code = signal_container_as_user(user_detail->pw_name, container_pid, signal); + exit_code = signal_container_as_user(yarn_user_name, container_pid, signal); } break; case DELETE_AS_USER: dir_to_be_deleted = argv[optind++]; - exit_code= delete_as_user(user_detail->pw_name, dir_to_be_deleted, + exit_code= delete_as_user(yarn_user_name, dir_to_be_deleted, argv + optind); break; default: