Return-Path: X-Original-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Delivered-To: apmail-hadoop-yarn-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id F182111C82 for ; Mon, 7 Jul 2014 04:40:58 +0000 (UTC) Received: (qmail 26036 invoked by uid 500); 7 Jul 2014 04:40:58 -0000 Delivered-To: apmail-hadoop-yarn-commits-archive@hadoop.apache.org Received: (qmail 25987 invoked by uid 500); 7 Jul 2014 04:40:58 -0000 Mailing-List: contact yarn-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: yarn-commits@hadoop.apache.org Delivered-To: mailing list yarn-commits@hadoop.apache.org Received: (qmail 25976 invoked by uid 99); 7 Jul 2014 04:40:58 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 07 Jul 2014 04:40:58 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 07 Jul 2014 04:40:56 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 9C993238896F; Mon, 7 Jul 2014 04:40:36 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1608336 - in /hadoop/common/branches/branch-2/hadoop-yarn-project: ./ hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-... Date: Mon, 07 Jul 2014 04:40:36 -0000 To: yarn-commits@hadoop.apache.org From: jianhe@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140707044036.9C993238896F@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jianhe Date: Mon Jul 7 04:40:36 2014 New Revision: 1608336 URL: http://svn.apache.org/r1608336 Log: Merge r1608334 from trunk. YARN-1367. Changed NM to not kill containers on NM resync if RM work-preserving restart is enabled. Contributed by Anubhav Dhoot Modified: hadoop/common/branches/branch-2/hadoop-yarn-project/CHANGES.txt hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java Modified: hadoop/common/branches/branch-2/hadoop-yarn-project/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-yarn-project/CHANGES.txt?rev=1608336&r1=1608335&r2=1608336&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-yarn-project/CHANGES.txt (original) +++ hadoop/common/branches/branch-2/hadoop-yarn-project/CHANGES.txt Mon Jul 7 04:40:36 2014 @@ -54,6 +54,9 @@ Release 2.5.0 - UNRELEASED YARN-1713. Added get-new-app and submit-app functionality to RM web services. (Varun Vasudev via vinodkv) + YARN-1367. Changed NM to not kill containers on NM resync if RM work-preserving + restart is enabled. (Anubhav Dhoot via jianhe) + IMPROVEMENTS YARN-1479. Invalid NaN values in Hadoop REST API JSON response (Chen He via Modified: hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java?rev=1608336&r1=1608335&r2=1608336&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (original) +++ hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java Mon Jul 7 04:40:36 2014 @@ -84,7 +84,8 @@ public class NodeManager extends Composi private NMStateStoreService nmStore = null; private AtomicBoolean isStopping = new AtomicBoolean(false); - + private boolean rmWorkPreservingRestartEnabled; + public NodeManager() { super(NodeManager.class.getName()); } @@ -173,6 +174,10 @@ public class NodeManager extends Composi conf.setBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY, true); + rmWorkPreservingRestartEnabled = conf.getBoolean(YarnConfiguration + .RM_WORK_PRESERVING_RECOVERY_ENABLED, + YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_ENABLED); + initAndStartRecoveryStore(conf); NMContainerTokenSecretManager containerTokenSecretManager = @@ -276,8 +281,12 @@ public class NodeManager extends Composi try { LOG.info("Notifying ContainerManager to block new container-requests"); containerManager.setBlockNewContainerRequests(true); - LOG.info("Cleaning up running containers on resync"); - containerManager.cleanupContainersOnNMResync(); + if (!rmWorkPreservingRestartEnabled) { + LOG.info("Cleaning up running containers on resync"); + containerManager.cleanupContainersOnNMResync(); + } else { + LOG.info("Preserving containers on resync"); + } ((NodeStatusUpdaterImpl) nodeStatusUpdater) .rebootNodeStatusUpdaterAndRegisterWithRM(); } catch (YarnRuntimeException e) { Modified: hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java?rev=1608336&r1=1608335&r2=1608336&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java (original) +++ hadoop/common/branches/branch-2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java Mon Jul 7 04:40:36 2014 @@ -59,7 +59,6 @@ import org.apache.hadoop.yarn.server.api import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; @@ -85,6 +84,9 @@ public class TestNodeManagerResync { private CyclicBarrier syncBarrier; private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false); private AtomicBoolean isNMShutdownCalled = new AtomicBoolean(false); + private final NodeManagerEvent resyncEvent = + new NodeManagerEvent(NodeManagerEventType.RESYNC); + @Before public void setup() throws UnsupportedFileSystemException { @@ -102,34 +104,56 @@ public class TestNodeManagerResync { assertionFailedInThread.set(false); } - @SuppressWarnings("unchecked") @Test public void testKillContainersOnResync() throws IOException, InterruptedException, YarnException { - NodeManager nm = new TestNodeManager1(); + TestNodeManager1 nm = new TestNodeManager1(false); + + testContainerPreservationOnResyncImpl(nm, false); + } + + @Test + public void testPreserveContainersOnResyncKeepingContainers() throws + IOException, + InterruptedException, YarnException { + TestNodeManager1 nm = new TestNodeManager1(true); + + testContainerPreservationOnResyncImpl(nm, true); + } + + @SuppressWarnings("unchecked") + protected void testContainerPreservationOnResyncImpl(TestNodeManager1 nm, + boolean isWorkPreservingRestartEnabled) + throws IOException, YarnException, InterruptedException { YarnConfiguration conf = createNMConfig(); - nm.init(conf); - nm.start(); - ContainerId cId = TestNodeManagerShutdown.createContainerId(); - TestNodeManagerShutdown.startContainer(nm, cId, localFS, tmpDir, - processStartFile); + conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, + isWorkPreservingRestartEnabled); - Assert.assertEquals(1, ((TestNodeManager1) nm).getNMRegistrationCount()); - nm.getNMDispatcher().getEventHandler(). - handle( new NodeManagerEvent(NodeManagerEventType.RESYNC)); try { - syncBarrier.await(); - } catch (BrokenBarrierException e) { + nm.init(conf); + nm.start(); + ContainerId cId = TestNodeManagerShutdown.createContainerId(); + TestNodeManagerShutdown.startContainer(nm, cId, localFS, tmpDir, + processStartFile); + + nm.setExistingContainerId(cId); + Assert.assertEquals(1, ((TestNodeManager1) nm).getNMRegistrationCount()); + nm.getNMDispatcher().getEventHandler().handle(resyncEvent); + try { + syncBarrier.await(); + } catch (BrokenBarrierException e) { + } + Assert.assertEquals(2, ((TestNodeManager1) nm).getNMRegistrationCount()); + // Only containers should be killed on resync, apps should lie around. + // That way local resources for apps can be used beyond resync without + // relocalization + Assert.assertTrue(nm.getNMContext().getApplications() + .containsKey(cId.getApplicationAttemptId().getApplicationId())); + Assert.assertFalse(assertionFailedInThread.get()); + } + finally { + nm.stop(); } - Assert.assertEquals(2, ((TestNodeManager1) nm).getNMRegistrationCount()); - // Only containers should be killed on resync, apps should lie around. That - // way local resources for apps can be used beyond resync without - // relocalization - Assert.assertTrue(nm.getNMContext().getApplications() - .containsKey(cId.getApplicationAttemptId().getApplicationId())); - Assert.assertFalse(assertionFailedInThread.get()); - - nm.stop(); } // This test tests new container requests are blocked when NM starts from @@ -157,7 +181,7 @@ public class TestNodeManagerResync { Assert.assertFalse(assertionFailedInThread.get()); nm.stop(); } - + @SuppressWarnings("unchecked") @Test(timeout=10000) public void testNMshutdownWhenResyncThrowException() throws IOException, @@ -169,7 +193,7 @@ public class TestNodeManagerResync { Assert.assertEquals(1, ((TestNodeManager3) nm).getNMRegistrationCount()); nm.getNMDispatcher().getEventHandler() .handle(new NodeManagerEvent(NodeManagerEventType.RESYNC)); - + synchronized (isNMShutdownCalled) { while (isNMShutdownCalled.get() == false) { try { @@ -178,7 +202,7 @@ public class TestNodeManagerResync { } } } - + Assert.assertTrue("NM shutdown not called.",isNMShutdownCalled.get()); nm.stop(); } @@ -313,6 +337,16 @@ public class TestNodeManagerResync { class TestNodeManager1 extends NodeManager { private int registrationCount = 0; + private boolean containersShouldBePreserved; + private ContainerId existingCid; + + public TestNodeManager1(boolean containersShouldBePreserved) { + this.containersShouldBePreserved = containersShouldBePreserved; + } + + public void setExistingContainerId(ContainerId cId) { + existingCid = cId; + } @Override protected NodeStatusUpdater createNodeStatusUpdater(Context context, @@ -344,10 +378,23 @@ public class TestNodeManagerResync { .containermanager.container.Container> containers = getNMContext().getContainers(); try { - // ensure that containers are empty before restart nodeStatusUpdater - Assert.assertTrue(containers.isEmpty()); - super.rebootNodeStatusUpdaterAndRegisterWithRM(); - syncBarrier.await(); + try { + if (containersShouldBePreserved) { + Assert.assertFalse(containers.isEmpty()); + Assert.assertTrue(containers.containsKey(existingCid)); + } else { + // ensure that containers are empty before restart nodeStatusUpdater + Assert.assertTrue(containers.isEmpty()); + } + super.rebootNodeStatusUpdaterAndRegisterWithRM(); + } + catch (AssertionError ae) { + ae.printStackTrace(); + assertionFailedInThread.set(true); + } + finally { + syncBarrier.await(); + } } catch (InterruptedException e) { } catch (BrokenBarrierException e) { } catch (AssertionError ae) {