Return-Path: X-Original-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Delivered-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 5112E10BCE for ; Tue, 1 Oct 2013 00:18:32 +0000 (UTC) Received: (qmail 67543 invoked by uid 500); 1 Oct 2013 00:18:32 -0000 Delivered-To: apmail-hadoop-yarn-commits-archive@hadoop.apache.org Received: (qmail 67518 invoked by uid 500); 1 Oct 2013 00:18:32 -0000 Mailing-List: contact yarn-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: yarn-commits@hadoop.apache.org Delivered-To: mailing list yarn-commits@hadoop.apache.org Received: (qmail 67510 invoked by uid 99); 1 Oct 2013 00:18:32 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 01 Oct 2013 00:18:32 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 01 Oct 2013 00:18:30 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 5E28F238890B; Tue, 1 Oct 2013 00:18:10 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1527827 - in /hadoop/common/trunk/hadoop-yarn-project: ./ hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ hadoop-yarn/hadoop-yarn-server/hadoo... Date: Tue, 01 Oct 2013 00:18:10 -0000 To: yarn-commits@hadoop.apache.org From: vinodkv@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131001001810.5E28F238890B@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: vinodkv Date: Tue Oct 1 00:18:09 2013 New Revision: 1527827 URL: http://svn.apache.org/r1527827 Log: YARN-1070. Fixed race conditions in NodeManager during container-kill. Contributed by Zhijie Shen. Modified: hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java Modified: hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt?rev=1527827&r1=1527826&r2=1527827&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt (original) +++ hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt Tue Oct 1 00:18:09 2013 @@ -101,7 +101,11 @@ Release 2.1.2 - UNRELEASED YARN-1221. With Fair Scheduler, reserved MB reported in RM web UI increases indefinitely (Siqi Li via Sandy Ryza) - YARN-1247. test-container-executor has gotten out of sync with the changes to container-executor. (rvs via tucu) + YARN-1247. test-container-executor has gotten out of sync with the changes to + container-executor. (rvs via tucu) + + YARN-1070. Fixed race conditions in NodeManager during container-kill. + (Zhijie Shen via vinodkv) Release 2.1.1-beta - 2013-09-23 Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java?rev=1527827&r1=1527826&r2=1527827&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (original) +++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java Tue Oct 1 00:18:09 2013 @@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.server.nod import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader; @@ -133,10 +134,22 @@ public class ContainerLaunch implements final List command = launchContext.getCommands(); int ret = -1; + // CONTAINER_KILLED_ON_REQUEST should not be missed if the container + // is already at KILLING + if (container.getContainerState() == ContainerState.KILLING) { + dispatcher.getEventHandler().handle( + new ContainerExitEvent(containerID, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, + Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : + ExitCode.TERMINATED.getExitCode(), + "Container terminated before launch.")); + return 0; + } + try { localResources = container.getLocalizedResources(); if (localResources == null) { - RPCUtil.getRemoteException( + throw RPCUtil.getRemoteException( "Unable to get local resources when Container " + containerID + " is at " + container.getContainerState()); } Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java?rev=1527827&r1=1527826&r2=1527827&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (original) +++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java Tue Oct 1 00:18:09 2013 @@ -75,20 +75,9 @@ public class ContainersLauncher extends new ThreadFactoryBuilder() .setNameFormat("ContainersLauncher #%d") .build()); - private final Map running = - Collections.synchronizedMap(new HashMap()); - - private static final class RunningContainer { - public RunningContainer(Future submit, - ContainerLaunch launcher) { - this.runningcontainer = submit; - this.launcher = launcher; - } - - Future runningcontainer; - ContainerLaunch launcher; - } - + @VisibleForTesting + public final Map running = + Collections.synchronizedMap(new HashMap()); public ContainersLauncher(Context context, Dispatcher dispatcher, ContainerExecutor exec, LocalDirsHandlerService dirsHandler, @@ -133,38 +122,20 @@ public class ContainersLauncher extends ContainerLaunch launch = new ContainerLaunch(context, getConfig(), dispatcher, exec, app, event.getContainer(), dirsHandler, containerManager); - running.put(containerId, - new RunningContainer(containerLauncher.submit(launch), - launch)); + containerLauncher.submit(launch); + running.put(containerId, launch); break; case CLEANUP_CONTAINER: - RunningContainer rContainerDatum = running.remove(containerId); - if (rContainerDatum == null) { + ContainerLaunch launcher = running.remove(containerId); + if (launcher == null) { // Container not launched. So nothing needs to be done. return; } - Future rContainer = rContainerDatum.runningcontainer; - if (rContainer != null - && !rContainer.isDone()) { - // Cancel the future so that it won't be launched if it isn't already. - // If it is going to be canceled, make sure CONTAINER_KILLED_ON_REQUEST - // will not be missed if the container is already at KILLING - if (rContainer.cancel(false)) { - if (container.getContainerState() == ContainerState.KILLING) { - dispatcher.getEventHandler().handle( - new ContainerExitEvent(containerId, - ContainerEventType.CONTAINER_KILLED_ON_REQUEST, - Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : - ExitCode.TERMINATED.getExitCode(), - "Container terminated before launch.")); - } - } - } // Cleanup a container whether it is running/killed/completed, so that // no sub-processes are alive. try { - rContainerDatum.launcher.cleanupContainer(); + launcher.cleanupContainer(); } catch (IOException e) { LOG.warn("Got exception while cleaning container " + containerId + ". Ignoring."); Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java?rev=1527827&r1=1527826&r2=1527827&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java (original) +++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java Tue Oct 1 00:18:09 2013 @@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.conf.YarnC import org.apache.hadoop.yarn.event.DrainDispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; +import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent; @@ -72,6 +73,7 @@ import org.apache.hadoop.yarn.server.nod import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType; @@ -296,8 +298,7 @@ public class TestContainer { wc.launchContainer(); reset(wc.localizerBus); wc.killContainer(); - assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, - wc.c.getContainerState()); + assertEquals(ContainerState.KILLING, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); wc.containerKilledOnRequest(); @@ -330,14 +331,18 @@ public class TestContainer { } @Test - public void testKillOnLocalized() throws Exception { + public void testKillOnLocalizedWhenContainerNotLaunched() throws Exception { WrappedContainer wc = null; try { wc = new WrappedContainer(17, 314159265358979L, 4344, "yak"); wc.initContainer(); wc.localizeResources(); assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState()); + ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId()); wc.killContainer(); + assertEquals(ContainerState.KILLING, wc.c.getContainerState()); + launcher.call(); + wc.drainDispatcherEvents(); assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); @@ -348,6 +353,31 @@ public class TestContainer { } } } + + @Test + public void testKillOnLocalizedWhenContainerLaunched() throws Exception { + WrappedContainer wc = null; + try { + wc = new WrappedContainer(17, 314159265358979L, 4344, "yak"); + wc.initContainer(); + wc.localizeResources(); + assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState()); + ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId()); + launcher.call(); + wc.drainDispatcherEvents(); + assertEquals(ContainerState.EXITED_WITH_FAILURE, + wc.c.getContainerState()); + wc.killContainer(); + assertEquals(ContainerState.EXITED_WITH_FAILURE, + wc.c.getContainerState()); + assertNull(wc.c.getLocalizedResources()); + verifyCleanupCall(wc); + } finally { + if (wc != null) { + wc.finished(); + } + } + } @Test public void testResourceLocalizedOnLocalizationFailed() throws Exception { @@ -472,12 +502,10 @@ public class TestContainer { wc.initContainer(); wc.localizeResources(); wc.killContainer(); - assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, - wc.c.getContainerState()); + assertEquals(ContainerState.KILLING, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); wc.launchContainer(); - assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, - wc.c.getContainerState()); + assertEquals(ContainerState.KILLING, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); wc.containerKilledOnRequest(); verifyCleanupCall(wc); @@ -650,7 +678,9 @@ public class TestContainer { Context context = mock(Context.class); when(context.getApplications()).thenReturn( new ConcurrentHashMap()); - launcher = new ContainersLauncher(context, dispatcher, null, null, null); + ContainerExecutor executor = mock(ContainerExecutor.class); + launcher = + new ContainersLauncher(context, dispatcher, executor, null, null); // create a mock ExecutorService, which will not really launch // ContainerLaunch at all. launcher.containerLauncher = mock(ExecutorService.class); Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java?rev=1527827&r1=1527826&r2=1527827&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java (original) +++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java Tue Oct 1 00:18:09 2013 @@ -682,9 +682,8 @@ public class TestContainerLaunch extends ContainerStatus containerStatus = containerManager.getContainerStatuses(gcsRequest) .getContainerStatuses().get(0); - int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : - ExitCode.TERMINATED.getExitCode(); - Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus()); + Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(), + containerStatus.getExitStatus()); // Now verify the contents of the file. Script generates a message when it // receives a sigterm so we look for that. We cannot perform this check on