Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 4B8B318B49 for ; Tue, 24 Nov 2015 22:36:13 +0000 (UTC) Received: (qmail 73566 invoked by uid 500); 24 Nov 2015 22:36:08 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 73503 invoked by uid 500); 24 Nov 2015 22:36:08 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 73494 invoked by uid 99); 24 Nov 2015 22:36:08 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 24 Nov 2015 22:36:08 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id F20CAE0941; Tue, 24 Nov 2015 22:36:07 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jlowe@apache.org To: common-commits@hadoop.apache.org Message-Id: <40207b1f06a642728d06a3efc5381336@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: YARN-4132. Separate configs for nodemanager to resourcemanager connection timeout and retries. Contributed by Chang Li Date: Tue, 24 Nov 2015 22:36:07 +0000 (UTC) Repository: hadoop Updated Branches: refs/heads/trunk f634505d4 -> 4ac6799d4 YARN-4132. Separate configs for nodemanager to resourcemanager connection timeout and retries. 
Contributed by Chang Li Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4ac6799d Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4ac6799d Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4ac6799d Branch: refs/heads/trunk Commit: 4ac6799d4a8b071e0d367c2d709e84d8ea06942d Parents: f634505 Author: Jason Lowe Authored: Tue Nov 24 22:35:37 2015 +0000 Committer: Jason Lowe Committed: Tue Nov 24 22:35:37 2015 +0000 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 + .../hadoop/yarn/conf/YarnConfiguration.java | 18 ++++- .../org/apache/hadoop/yarn/client/RMProxy.java | 39 ++++++++- .../src/main/resources/yarn-default.xml | 20 +++++ .../hadoop/yarn/server/api/ServerRMProxy.java | 22 +++++- .../nodemanager/TestNodeStatusUpdater.java | 83 ++++++++++++++++++++ 6 files changed, 181 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 204c338..4483589 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -571,6 +571,9 @@ Release 2.8.0 - UNRELEASED YARN-3980. Plumb resource-utilization info in node heartbeat through to the scheduler. (Inigo Goiri via kasha) + YARN-4132. Separate configs for nodemanager to resourcemanager connection + timeout and retries (Chang Li via jlowe) + OPTIMIZATIONS YARN-3339. 
TestDockerContainerExecutor should pull a single image and not http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 18e6082..f493fd3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -2050,7 +2050,23 @@ public class YarnConfiguration extends Configuration { public static final String YARN_HTTP_POLICY_KEY = YARN_PREFIX + "http.policy"; public static final String YARN_HTTP_POLICY_DEFAULT = HttpConfig.Policy.HTTP_ONLY .name(); - + + /** + * Max time to wait for NM to connect to RM. + * When not set, proxy will fall back to use value of + * RESOURCEMANAGER_CONNECT_MAX_WAIT_MS. + */ + public static final String NM_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS = + YARN_PREFIX + "nodemanager.resourcemanager.connect.max-wait.ms"; + + /** + * Time interval between each NM attempt to connect to RM. + * When not set, proxy will fall back to use value of + * RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS. 
+ */ + public static final String NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS = + YARN_PREFIX + "nodemanager.resourcemanager.connect.retry-interval.ms"; + /** * Node-labels configurations */ http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java index 23e1691..3779ce5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java @@ -88,7 +88,32 @@ public class RMProxy { YarnConfiguration conf = (configuration instanceof YarnConfiguration) ? (YarnConfiguration) configuration : new YarnConfiguration(configuration); - RetryPolicy retryPolicy = createRetryPolicy(conf); + RetryPolicy retryPolicy = + createRetryPolicy(conf); + return createRMProxy(conf, protocol, instance, retryPolicy); + } + + /** + * Create a proxy for the specified protocol. For non-HA, + * this is a direct connection to the ResourceManager address. When HA is + * enabled, the proxy handles the failover between the ResourceManagers as + * well. + */ + @Private + protected static T createRMProxy(final Configuration configuration, + final Class protocol, RMProxy instance, final long retryTime, + final long retryInterval) throws IOException { + YarnConfiguration conf = (configuration instanceof YarnConfiguration) + ? 
(YarnConfiguration) configuration + : new YarnConfiguration(configuration); + RetryPolicy retryPolicy = + createRetryPolicy(conf, retryTime, retryInterval); + return createRMProxy(conf, protocol, instance, retryPolicy); + } + + private static T createRMProxy(final YarnConfiguration conf, + final Class protocol, RMProxy instance, RetryPolicy retryPolicy) + throws IOException{ if (HAUtil.isHAEnabled(conf)) { RMFailoverProxyProvider provider = instance.createRMFailoverProxyProvider(conf, protocol); @@ -179,6 +204,18 @@ public class RMProxy { YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, YarnConfiguration .DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS); + return createRetryPolicy( + conf, rmConnectWaitMS, rmConnectionRetryIntervalMS); + } + + /** + * Fetch retry policy from Configuration and create the + * retry policy with specified retryTime and retry interval. + */ + private static RetryPolicy createRetryPolicy(Configuration conf, + long retryTime, long retryInterval) { + long rmConnectWaitMS = retryTime; + long rmConnectionRetryIntervalMS = retryInterval; boolean waitForEver = (rmConnectWaitMS == -1); if (!waitForEver) { http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 997eb8e..9bbdb94 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1551,6 +1551,26 @@ + Max time to wait for NM to connect to RM. + When not set, proxy will fall back to use value of + yarn.resourcemanager.connect.max-wait.ms. 
+ + yarn.nodemanager.resourcemanager.connect.max-wait.ms + + + + + + Time interval between each NM attempt to connect to RM. + When not set, proxy will fall back to use value of + yarn.resourcemanager.connect.retry-interval.ms. + + yarn.nodemanager.resourcemanager.connect.retry-interval.ms + + + + + Maximum number of proxy connections to cache for node managers. If set to a value greater than zero then the cache is enabled and the NMClient and MRAppMaster will cache the specified number of node manager proxies. http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java index 5d4fc46..2d4085f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java @@ -48,8 +48,26 @@ public class ServerRMProxy extends RMProxy { */ public static T createRMProxy(final Configuration configuration, final Class protocol) throws IOException { - return createRMProxy(configuration, protocol, INSTANCE); -} + long rmConnectWait = + configuration.getLong( + YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS); + long rmRetryInterval = + configuration.getLong( + YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + YarnConfiguration + 
.DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS); + long nmRmConnectWait = + configuration.getLong( + YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + rmConnectWait); + long nmRmRetryInterval = + configuration.getLong( + YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + rmRetryInterval); + return createRMProxy(configuration, protocol, INSTANCE, + nmRmConnectWait, nmRmRetryInterval); + } @InterfaceAudience.Private @Override http://git-wip-us.apache.org/repos/asf/hadoop/blob/4ac6799d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index e231f1b..90804b8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -95,6 +95,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; @@ -486,6 +487,35 @@ public class TestNodeStatusUpdater { } } + private class MyNodeStatusUpdater6 extends NodeStatusUpdaterImpl { + + private final long rmStartIntervalMS; + private final boolean rmNeverStart; + public ResourceTracker resourceTracker; + public MyNodeStatusUpdater6(Context context, Dispatcher dispatcher, + NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, + long rmStartIntervalMS, boolean rmNeverStart) { + super(context, dispatcher, healthChecker, metrics); + this.rmStartIntervalMS = rmStartIntervalMS; + this.rmNeverStart = rmNeverStart; + } + + @Override + protected void serviceStart() throws Exception { + //record the startup time + super.serviceStart(); + } + + private boolean isTriggered() { + return triggered; + } + + @Override + protected void stopRMProxy() { + return; + } + } + private class MyNodeManager extends NodeManager { private MyNodeStatusUpdater3 nodeStatusUpdater; @@ -1309,6 +1339,59 @@ public class TestNodeStatusUpdater { + "Message from ResourceManager: RM Shutting Down Node"); } + @Test (timeout = 100000) + public void testNMRMConnectionConf() throws Exception { + final long delta = 50000; + final long nmRmConnectionWaitMs = 100; + final long nmRmRetryInterval = 100; + final long connectionWaitMs = -1; + final long connectionRetryIntervalMs = 1000; + //Waiting for rmStartIntervalMS, RM will be started + final long rmStartIntervalMS = 2*1000; + conf.setLong(YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + nmRmConnectionWaitMs); + conf.setLong( + YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + nmRmRetryInterval); + conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + connectionWaitMs); + conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + connectionRetryIntervalMs); + conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, + 1); + //Test NM 
try to connect to RM Several times, but finally fail + NodeManagerWithCustomNodeStatusUpdater nmWithUpdater; + nm = nmWithUpdater = new NodeManagerWithCustomNodeStatusUpdater() { + @Override + protected NodeStatusUpdater createUpdater(Context context, + Dispatcher dispatcher, NodeHealthCheckerService healthChecker) { + NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater6( + context, dispatcher, healthChecker, metrics, + rmStartIntervalMS, true); + return nodeStatusUpdater; + } + }; + nm.init(conf); + long waitStartTime = System.currentTimeMillis(); + try { + nm.start(); + Assert.fail("NM should have failed to start due to RM connect failure"); + } catch(Exception e) { + long t = System.currentTimeMillis(); + long duration = t - waitStartTime; + boolean waitTimeValid = (duration >= nmRmConnectionWaitMs) && + (duration < (connectionWaitMs + delta)); + + if(!waitTimeValid) { + // throw exception if NM doesn't retry long enough + throw new Exception("NM should have tried re-connecting to RM during " + + "period of at least " + connectionWaitMs + " ms, but " + + "stopped retrying within " + (connectionWaitMs + delta) + + " ms: " + e, e); + } + } + } + @Test (timeout = 150000) public void testNMConnectionToRM() throws Exception { final long delta = 50000;