Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 9D69618050 for ; Fri, 23 Oct 2015 19:02:06 +0000 (UTC) Received: (qmail 80624 invoked by uid 500); 23 Oct 2015 19:00:32 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 80121 invoked by uid 500); 23 Oct 2015 19:00:32 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 74788 invoked by uid 99); 23 Oct 2015 19:00:29 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 23 Oct 2015 19:00:29 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 5F96CE01F5; Fri, 23 Oct 2015 19:00:29 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: eclark@apache.org To: common-commits@hadoop.apache.org Date: Fri, 23 Oct 2015 19:01:29 -0000 Message-Id: <48d55ca2f593416f9600b91b964cbe42@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [62/75] [abbrv] hadoop git commit: YARN-4243. Add retry on establishing Zookeeper conenction in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong. YARN-4243. Add retry on establishing Zookeeper conenction in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong. 
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/0fce5f9a Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/0fce5f9a Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/0fce5f9a Branch: refs/heads/HADOOP-11890 Commit: 0fce5f9a496925f0d53ea6c14318c9b513de9882 Parents: 960201b Author: Junping Du Authored: Thu Oct 22 13:41:09 2015 -0700 Committer: Junping Du Committed: Thu Oct 22 13:41:09 2015 -0700 ---------------------------------------------------------------------- .../apache/hadoop/ha/ActiveStandbyElector.java | 53 ++++++++++++++++++-- hadoop-yarn-project/CHANGES.txt | 3 ++ .../hadoop/yarn/conf/YarnConfiguration.java | 4 ++ .../src/main/resources/yarn-default.xml | 7 +++ .../resourcemanager/EmbeddedElectorService.java | 9 ++-- 5 files changed, 68 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java index fcbcfdf..cb2e081 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -208,8 +208,49 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { */ public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName, List acl, - List authInfo, - ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException, + List authInfo, ActiveStandbyElectorCallback app, + int maxRetryNum) 
throws IOException, HadoopIllegalArgumentException, + KeeperException { + this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl, + authInfo, app, maxRetryNum, true); + } + + /** + * Create a new ActiveStandbyElector object
+ * The elector is created by providing to it the Zookeeper configuration, the + * parent znode under which to create the znode and a reference to the + * callback interface.
+ * The parent znode name must be the same for all service instances and + * different across services.
+ * After the leader has been lost, a new leader will be elected after the + * session timeout expires. Hence, the app must set this parameter based on + * its needs for failure response time. The session timeout must be greater + * than the Zookeeper disconnect timeout and is recommended to be 3X that + * value to enable Zookeeper to retry transient disconnections. Setting a very + * short session timeout may result in frequent transitions between active and + * standby states during issues like network outages/GC pauses. + * + * @param zookeeperHostPorts + * ZooKeeper hostPort for all ZooKeeper servers + * @param zookeeperSessionTimeout + * ZooKeeper session timeout + * @param parentZnodeName + * znode under which to create the lock + * @param acl + * ZooKeeper ACL's + * @param authInfo a list of authentication credentials to add to the + * ZK connection + * @param app + * reference to callback interface object + * @param failFast + * when false, retry establishing the ZK connection; when true, do not. 
+ * @throws IOException + * @throws HadoopIllegalArgumentException + */ + public ActiveStandbyElector(String zookeeperHostPorts, + int zookeeperSessionTimeout, String parentZnodeName, List acl, + List authInfo, ActiveStandbyElectorCallback app, + int maxRetryNum, boolean failFast) throws IOException, HadoopIllegalArgumentException, KeeperException { if (app == null || acl == null || parentZnodeName == null || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { @@ -225,8 +266,12 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; this.maxRetryNum = maxRetryNum; - // createConnection for future API calls - createConnection(); + // establish the ZK Connection for future API calls + if (failFast) { + createConnection(); + } else { + reEstablishSession(); + } } /** http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 024255c..9f35307 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED YARN-3985. Make ReservationSystem persist state using RMStateStore reservation APIs. (adhoot via asuresh) + YARN-4243. Add retry on establishing Zookeeper conenction in + EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du) + OPTIMIZATIONS YARN-3339. 
TestDockerContainerExecutor should pull a single image and not http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 3e89259..913b5df 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -531,6 +531,10 @@ public class YarnConfiguration extends Configuration { public static final int DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0; + /** number of zookeeper operation retry times in ActiveStandbyElector */ + public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX + + "failover-controller.active-standby-elector.zk.retries"; + //////////////////////////////// // RM state store configs //////////////////////////////// http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 5dc4590..c6ffe18 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -389,6 +389,13 @@ + When automatic failover is enabled, number of zookeeper + operation retry 
times in ActiveStandbyElector + yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries + + + + The maximum number of completed applications RM state store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}. By default, it equals to ${yarn.resourcemanager.max-completed-applications}. http://git-wip-us.apache.org/repos/asf/hadoop/blob/0fce5f9a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java index 73bdca0..72327e8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java @@ -86,11 +86,12 @@ public class EmbeddedElectorService extends AbstractService List zkAcls = RMZKUtils.getZKAcls(conf); List zkAuths = RMZKUtils.getZKAuths(conf); - int maxRetryNum = conf.getInt( - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + int maxRetryNum = + conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf + .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)); elector = new ActiveStandbyElector(zkQuorum, (int) 
zkSessionTimeout, - electionZNode, zkAcls, zkAuths, this, maxRetryNum); + electionZNode, zkAcls, zkAuths, this, maxRetryNum, false); elector.ensureParentZNode(); if (!isParentZnodeSafe(clusterId)) {