Return-Path: X-Original-To: apmail-ambari-commits-archive@www.apache.org Delivered-To: apmail-ambari-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 10EC011B4A for ; Tue, 13 May 2014 18:52:31 +0000 (UTC) Received: (qmail 24310 invoked by uid 500); 13 May 2014 17:52:31 -0000 Delivered-To: apmail-ambari-commits-archive@ambari.apache.org Received: (qmail 24280 invoked by uid 500); 13 May 2014 17:52:31 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 24273 invoked by uid 99); 13 May 2014 17:52:31 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 13 May 2014 17:52:31 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 35F378B6008; Tue, 13 May 2014 17:52:31 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: aonishuk@apache.org To: commits@ambari.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: git commit: AMBARI-5681. Add Nagios alert if HDFS last checkpoint time exceeds threshold (aonishuk) Date: Tue, 13 May 2014 17:52:31 +0000 (UTC) Repository: ambari Updated Branches: refs/heads/branch-1.6.0 ae48f4e3e -> f74e95037 AMBARI-5681. Add Nagios alert if HDFS last checkpoint time exceeds threshold (aonishuk) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/f74e9503 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/f74e9503 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/f74e9503 Branch: refs/heads/branch-1.6.0 Commit: f74e95037e9ac004f9c1177a61fbc79338bf7358 Parents: ae48f4e Author: Andrew Onishuk Authored: Tue May 13 20:52:17 2014 +0300 Committer: Andrew Onishuk Committed: Tue May 13 20:52:17 2014 +0300 ---------------------------------------------------------------------- .../package/files/check_checkpoint_time.py | 112 +++++++++++++++++++ .../package/scripts/nagios_server_config.py | 1 + .../services/NAGIOS/package/scripts/params.py | 3 + .../package/templates/hadoop-commands.cfg.j2 | 5 + .../package/templates/hadoop-services.cfg.j2 | 11 ++ .../stacks/2.0.6/NAGIOS/test_nagios_server.py | 5 + 6 files changed, 137 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py new file mode 100644 index 0000000..ab889d1 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# + +import os +import optparse +import time +import urllib2 +import json + +CRIT_MESSAGE = "CRITICAL: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago" +WARNING_MESSAGE = "WARNING: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago" +OK_MESSAGE = "OK: Last checkpoint time" +WARNING_JMX_MESSAGE = "WARNING: NameNode JMX not accessible" + +def main(): + + current_time = int(round(time.time() * 1000)) + + parser = optparse.OptionParser() + + parser.add_option("-H", "--host", dest="host", + default="localhost", help="NameNode host") + parser.add_option("-p", "--port", dest="port", + default="50070", help="NameNode jmx port") + parser.add_option("-w", "--warning", dest="warning", + default="200", help="Percent for warning alert") + parser.add_option("-c", "--critical", dest="crit", + default="200", help="Percent for critical alert") + parser.add_option("-t", "--period", dest="period", + default="21600", help="Period time") + parser.add_option("-x", "--txns", dest="txns", + default="1000000", + help="CheckpointNode will create a checkpoint of the namespace every 'dfs.namenode.checkpoint.txns'") + (options, args) = parser.parse_args() + + host = get_available_nn_host(options) + + last_checkpoint_time_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".\ + format(host=host, port=options.port) + last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) + + journal_transaction_info_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".\ + format(host=host, port=options.port) + journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") + journal_transaction_info_dict = json.loads(journal_transaction_info) + + last_txid = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) + most_txid = int(journal_transaction_info_dict['MostRecentCheckpointTxId']) + + delta = (current_time - last_checkpoint_time)/1000 + + if ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.crit)): + print CRIT_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m']) + exit(2) + elif ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.warning)): + print WARNING_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m']) + exit(1) + else: + print OK_MESSAGE + exit(0) + +def get_time(delta): + h = int(delta/3600) + m = int((delta % 3600)/60) + return {'h':h, 'm':m} + +def get_value_from_jmx(qry, property): + try: + response = urllib2.urlopen(qry) + data=response.read() + except Exception: + print WARNING_JMX_MESSAGE + exit(1) + + data_dict = json.loads(data) + return (data_dict["beans"][0][property]) + +def get_available_nn_host(options): + nn_hosts = options.host.split(" ") + for nn_host in nn_hosts: + try: + urllib2.urlopen("http://{host}:{port}/jmx".format(host=nn_host, port=options.port)) + return nn_host + except Exception: + pass + print WARNING_JMX_MESSAGE + exit(1) + +if __name__ == "__main__": + main() + + + http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py index 1f75057..9e66510 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py @@ -65,6 +65,7 @@ def nagios_server_config(): nagios_server_check( 'check_namenodes_ha.sh') nagios_server_check( 'check_wrapper.sh') nagios_server_check( 'hdp_nagios_init.php') + nagios_server_check( 'check_checkpoint_time.py' ) def nagios_server_configfile( http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py index e8f0150..935521a 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py @@ -73,6 +73,8 @@ supervisor_port = "56431" storm_rest_api_port = "8745" falcon_port = config['configurations']['global']['falcon_port'] ahs_port = get_port_from_url(config['configurations']['yarn-site']['yarn.timeline-service.webapp.address']) +dfs_namenode_checkpoint_period = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.period'] +dfs_namenode_checkpoint_txns = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.txns'] # this is different for HDP1 nn_metrics_property = "FSNamesystem" @@ -160,6 +162,7 @@ _falcon_host = default("/clusterHostInfo/falcon_server_hosts", None) _hbase_rs_hosts = default("/clusterHostInfo/hbase_rs_hosts", _slave_hosts) _hue_server_host = default("/clusterHostInfo/hue_server_host", None) all_hosts = config['clusterHostInfo']['all_hosts'] +nn_hosts_string = " ".join(namenode_host) hostgroup_defs = { http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 index da37d73..3d53b2b 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 @@ -135,4 +135,9 @@ define command{ command_name check_tcp_wrapper command_line $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$ } + +define command{ + command_name check_checkpoint_time + command_line python $USER1$/check_checkpoint_time.py -H "$ARG1$" -p $ARG2$ -w $ARG3$ -c $ARG4$ -t $ARG5$ -x $ARG6$ +} http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 index b77e77b..94fde2b 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 @@ -409,6 +409,17 @@ define service { {% endfor %} define service { + host_name {{namenode_host[0]}} + use hadoop-service + service_description NAMENODE::Last checkpoint time + servicegroups HDFS + check_command check_checkpoint_time!{{ nn_hosts_string }}!{{ namenode_port }}!200!200!{{ dfs_namenode_checkpoint_period }}!{{dfs_namenode_checkpoint_txns}} + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} + +define service { hostgroup_name nagios-server use hadoop-service service_description HDFS::Blocks health http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py index 0839995..75e1839 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py +++ b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py @@ -242,6 +242,11 @@ class TestNagiosServer(RMFTestCase): content=StaticFile('hdp_nagios_init.php'), mode=0755 ) + self.assertResourceCalled('File', + '/usr/lib64/nagios/plugins/check_checkpoint_time.py', + content=StaticFile('check_checkpoint_time.py'), + mode=0755 + ) self.assertResourceCalled('Execute', 'htpasswd2 -c -b /etc/nagios/htpasswd.users nagiosadmin \'!`"\'"\'"\' 1\'', not_if="grep nagiosadmin /etc/nagios/htpasswd.users"