Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 9E11420049D for ; Wed, 9 Aug 2017 17:55:17 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 9C95E169712; Wed, 9 Aug 2017 15:55:17 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id E69AA16970A for ; Wed, 9 Aug 2017 17:55:16 +0200 (CEST) Received: (qmail 47657 invoked by uid 500); 9 Aug 2017 15:55:15 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 47503 invoked by uid 99); 9 Aug 2017 15:55:15 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 09 Aug 2017 15:55:15 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id E3A4EF5535; Wed, 9 Aug 2017 15:55:14 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jluniya@apache.org To: commits@ambari.apache.org Date: Wed, 09 Aug 2017 15:55:19 -0000 Message-Id: <8df1a2b556b046d68ad6e6137a3d579d@git.apache.org> In-Reply-To: <1bd593aa6951467e925c8996f5b2760e@git.apache.org> References: <1bd593aa6951467e925c8996f5b2760e@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [06/52] [abbrv] ambari git commit: AMBARI-21593 : AMS stopped after RU [AMS distributed mode with 2 collectors] (avijayan) archived-at: Wed, 09 Aug 2017 15:55:17 -0000 AMBARI-21593 : AMS stopped after RU [AMS distributed mode with 2 collectors] (avijayan) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/c7b350b6 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/c7b350b6 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/c7b350b6 Branch: refs/heads/branch-feature-AMBARI-14714 Commit: c7b350b678b82bae1c0834744249cb534fed18f1 Parents: 2bab215 Author: Aravindan Vijayan Authored: Mon Jul 31 14:30:27 2017 -0700 Committer: Aravindan Vijayan Committed: Mon Jul 31 14:30:27 2017 -0700 ---------------------------------------------------------------------- .../MetricCollectorHAController.java | 42 +++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/c7b350b6/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java ---------------------------------------------------------------------- diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java index 53e6304..addb14e 100644 --- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java +++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java @@ -26,6 +26,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.MetricsSystemInitializationException; import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.TimelineMetricConfiguration; import org.apache.helix.HelixAdmin; +import org.apache.helix.HelixException; import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; @@ -123,20 +124,41 @@ public class MetricCollectorHAController { admin = new ZKHelixAdmin(zkConnectUrl); // create cluster LOG.info("Creating zookeeper cluster node: " + clusterName); - admin.addCluster(clusterName, false); + boolean clusterAdded = admin.addCluster(clusterName, false); + LOG.info("Was cluster added successfully? " + clusterAdded); // Adding host to the cluster - List nodes = Collections.EMPTY_LIST; - try { - nodes = admin.getInstancesInCluster(clusterName); - } catch (ZkNoNodeException ex) { - LOG.warn("Child znode under /" + CLUSTER_NAME + " not found.Recreating the cluster."); - admin.addCluster(clusterName, true); + boolean success = false; + int tries = 5; + int sleepTimeInSeconds = 5; + + for (int i = 0; i < tries && !success; i++) { + try { + List nodes = admin.getInstancesInCluster(clusterName); + if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { + LOG.info("Adding participant instance " + instanceConfig); + admin.addInstance(clusterName, instanceConfig); + success = true; + } + } catch (HelixException | ZkNoNodeException ex) { + LOG.warn("Helix Cluster not yet setup fully."); + if (i < tries - 1) { + LOG.info("Waiting for " + sleepTimeInSeconds + " seconds and retrying."); + TimeUnit.SECONDS.sleep(sleepTimeInSeconds); + } else { + LOG.error(ex); + } + } } - if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { - LOG.info("Adding participant instance " + instanceConfig); - admin.addInstance(clusterName, instanceConfig); + if (!success) { + LOG.info("Trying to create " + clusterName + " again since waiting for the creation did not help."); + admin.addCluster(clusterName, true); + List nodes = admin.getInstancesInCluster(clusterName); + if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { + LOG.info("Adding participant instance " + instanceConfig); + admin.addInstance(clusterName, instanceConfig); + } } // Add a state model