Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2A90518AFA for ; Thu, 8 Oct 2015 16:40:13 +0000 (UTC) Received: (qmail 44408 invoked by uid 500); 8 Oct 2015 16:40:12 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 44347 invoked by uid 500); 8 Oct 2015 16:40:12 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 44337 invoked by uid 99); 8 Oct 2015 16:40:12 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 08 Oct 2015 16:40:12 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id B3528E0BC6; Thu, 8 Oct 2015 16:40:12 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jlowe@apache.org To: common-commits@hadoop.apache.org Message-Id: <194375877e344a1ba7047d4cf06ac499@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id has not been reset synchronously. (Jun Gong via rohithsharmaks) (cherry picked from commit feaf0349949e831ce3f25814c1bbff52f17bfe8f) Date: Thu, 8 Oct 2015 16:40:12 +0000 (UTC) Repository: hadoop Updated Branches: refs/heads/branch-2.6 528b809d2 -> ac865de72 YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id has not been reset synchronously. (Jun Gong via rohithsharmaks) (cherry picked from commit feaf0349949e831ce3f25814c1bbff52f17bfe8f) Conflicts: hadoop-yarn-project/CHANGES.txt Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/ac865de7 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/ac865de7 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/ac865de7 Branch: refs/heads/branch-2.6 Commit: ac865de725948569932d2e054703f6ba866484f7 Parents: 528b809 Author: Jason Lowe Authored: Thu Oct 8 16:39:46 2015 +0000 Committer: Jason Lowe Committed: Thu Oct 8 16:39:46 2015 +0000 ---------------------------------------------------------------------- .../hadoop/yarn/sls/nodemanager/NodeInfo.java | 3 ++ .../yarn/sls/scheduler/RMNodeWrapper.java | 5 +++ hadoop-yarn-project/CHANGES.txt | 3 ++ .../resourcemanager/ResourceTrackerService.java | 2 + .../server/resourcemanager/rmnode/RMNode.java | 7 +++- .../resourcemanager/rmnode/RMNodeImpl.java | 15 +++++--- .../yarn/server/resourcemanager/MockNodes.java | 4 ++ .../resourcetracker/TestNMReconnect.java | 39 ++++++++++++++++++++ 8 files changed, 72 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java index ee6eb7b..dbea90f 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java @@ -143,6 +143,9 @@ public class NodeInfo { return null; } + public void resetLastNodeHeartBeatResponse() { + } + public List pullContainerUpdates() { ArrayList list = new ArrayList(); http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java index b64be1b..356b8bd 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java @@ -130,6 +130,11 @@ public class RMNodeWrapper implements RMNode { } @Override + public void resetLastNodeHeartBeatResponse() { + node.getLastNodeHeartBeatResponse().setResponseId(0); + } + + @Override @SuppressWarnings("unchecked") public List pullContainerUpdates() { List list = Collections.EMPTY_LIST; http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 5fc76b3..d4dd07d 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -42,6 +42,9 @@ Release 2.6.2 - UNRELEASED YARN-3194. RM should handle NMContainerStatuses sent by NM while registering if NM is Reconnected node (Rohith via jlowe) + YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id + has not been reset synchronously. (Jun Gong via rohithsharmaks) + Release 2.6.1 - 2015-09-23 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index 29a6920..1352cc5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -312,6 +312,8 @@ public class ResourceTrackerService extends AbstractService implements } else { LOG.info("Reconnect from the node at: " + host); this.nmLivelinessMonitor.unregister(nodeId); + // Reset heartbeat ID since node just restarted. + oldNode.resetLastNodeHeartBeatResponse(); this.rmContext .getDispatcher() .getEventHandler() http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java index 95eeaf6..ed6875b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java @@ -127,7 +127,12 @@ public interface RMNode { public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response); public NodeHeartbeatResponse getLastNodeHeartBeatResponse(); - + + /** + * Reset lastNodeHeartbeatResponse's ID to 0. + */ + void resetLastNodeHeartBeatResponse(); + /** * Get and clear the list of containerUpdates accumulated across NM * heartbeats. http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index 694cd1a..d2fe991 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -408,6 +408,16 @@ public class RMNodeImpl implements RMNode, EventHandler { } } + @Override + public void resetLastNodeHeartBeatResponse() { + this.writeLock.lock(); + try { + latestNodeHeartBeatResponse.setResponseId(0); + } finally { + this.writeLock.unlock(); + } + } + public void handle(RMNodeEvent event) { LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType()); try { @@ -567,8 +577,6 @@ public class RMNodeImpl implements RMNode, EventHandler { new NodeRemovedSchedulerEvent(rmNode)); if (rmNode.getHttpPort() == newNode.getHttpPort()) { - // Reset heartbeat ID since node just restarted. - rmNode.getLastNodeHeartBeatResponse().setResponseId(0); if (!rmNode.getTotalCapability().equals( newNode.getTotalCapability())) { rmNode.totalCapability = newNode.getTotalCapability(); @@ -604,9 +612,6 @@ public class RMNodeImpl implements RMNode, EventHandler { handleNMContainerStatus(reconnectEvent.getNMContainerStatuses(), rmNode); - // Reset heartbeat ID since node just restarted. - rmNode.getLastNodeHeartBeatResponse().setResponseId(0); - for (ApplicationId appId : reconnectEvent.getRunningApplications()) { handleRunningAppOnNode(rmNode, rmNode.context, appId, rmNode.nodeId); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java index 278c151..6c98524 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java @@ -186,6 +186,10 @@ public class MockNodes { } @Override + public void resetLastNodeHeartBeatResponse() { + } + + @Override public String getNodeManagerVersion() { return null; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/ac865de7/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java index b525efc..dce3d06 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java @@ -21,6 +21,11 @@ package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker; import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.event.DrainDispatcher; +import org.apache.hadoop.yarn.server.resourcemanager.MockNM; +import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.NodeId; @@ -189,4 +194,38 @@ public class TestNMReconnect { nlm.stop(); scheduler.stop(); } + + @Test(timeout = 10000) + public void testRMNodeStatusAfterReconnect() throws Exception { + // The node(127.0.0.1:1234) reconnected with RM. When it registered with + // RM, RM set its lastNodeHeartbeatResponse's id to 0 asynchronously. But + // the node's heartbeat come before RM succeeded setting the id to 0. + final DrainDispatcher dispatcher = new DrainDispatcher(); + MockRM rm = new MockRM(){ + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; + rm.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm1.registerNode(); + int i = 0; + while(i < 3) { + nm1.nodeHeartbeat(true); + dispatcher.await(); + i++; + } + + MockNM nm2 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm2.registerNode(); + RMNode rmNode = rm.getRMContext().getRMNodes().get(nm2.getNodeId()); + nm2.nodeHeartbeat(true); + dispatcher.await(); + Assert.assertEquals("Node is Not in Running state.", NodeState.RUNNING, + rmNode.getState()); + rm.stop(); + } }