ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aonis...@apache.org
Subject git commit: AMBARI-5681. Add Nagios alert if HDFS last checkpoint time exceeds threshold (aonishuk)
Date Tue, 13 May 2014 17:52:31 GMT
Repository: ambari
Updated Branches:
  refs/heads/branch-1.6.0 ae48f4e3e -> f74e95037


AMBARI-5681. Add Nagios alert if HDFS last checkpoint time exceeds threshold (aonishuk)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/f74e9503
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/f74e9503
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/f74e9503

Branch: refs/heads/branch-1.6.0
Commit: f74e95037e9ac004f9c1177a61fbc79338bf7358
Parents: ae48f4e
Author: Andrew Onishuk <aonishuk@hortonworks.com>
Authored: Tue May 13 20:52:17 2014 +0300
Committer: Andrew Onishuk <aonishuk@hortonworks.com>
Committed: Tue May 13 20:52:17 2014 +0300

----------------------------------------------------------------------
 .../package/files/check_checkpoint_time.py      | 112 +++++++++++++++++++
 .../package/scripts/nagios_server_config.py     |   1 +
 .../services/NAGIOS/package/scripts/params.py   |   3 +
 .../package/templates/hadoop-commands.cfg.j2    |   5 +
 .../package/templates/hadoop-services.cfg.j2    |  11 ++
 .../stacks/2.0.6/NAGIOS/test_nagios_server.py   |   5 +
 6 files changed, 137 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py
new file mode 100644
index 0000000..ab889d1
--- /dev/null
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+import os
+import optparse
+import time
+import urllib2
+import json
+
+CRIT_MESSAGE = "CRITICAL: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago"
+WARNING_MESSAGE = "WARNING: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago"
+OK_MESSAGE = "OK: Last checkpoint time"
+WARNING_JMX_MESSAGE = "WARNING: NameNode JMX not accessible"
+
+def main():
+
+  current_time = int(round(time.time() * 1000))
+
+  parser = optparse.OptionParser()
+
+  parser.add_option("-H", "--host", dest="host",
+                    default="localhost", help="NameNode host")
+  parser.add_option("-p", "--port", dest="port",
+                    default="50070", help="NameNode jmx port")
+  parser.add_option("-w", "--warning", dest="warning",
+                    default="200", help="Percent for warning alert")
+  parser.add_option("-c", "--critical", dest="crit",
+                    default="200", help="Percent for critical alert")
+  parser.add_option("-t", "--period", dest="period",
+                    default="21600", help="Period time")
+  parser.add_option("-x", "--txns", dest="txns",
+                    default="1000000",
+                    help="CheckpointNode will create a checkpoint of the namespace every 'dfs.namenode.checkpoint.txns'")
+  (options, args) = parser.parse_args()
+
+  host = get_available_nn_host(options)
+
+  last_checkpoint_time_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".\
+    format(host=host, port=options.port)
+  last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime"))
+
+  journal_transaction_info_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".\
+    format(host=host, port=options.port)
+  journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo")
+  journal_transaction_info_dict = json.loads(journal_transaction_info)
+
+  last_txid = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
+  most_txid = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
+
+  delta = (current_time - last_checkpoint_time)/1000
+
+  if ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.crit)):
+    print CRIT_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m'])
+    exit(2)
+  elif ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.warning)):
+    print WARNING_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m'])
+    exit(1)
+  else:
+    print OK_MESSAGE
+    exit(0)
+
+def get_time(delta):
+  h = int(delta/3600)
+  m = int((delta % 3600)/60)
+  return {'h':h, 'm':m}
+
+def get_value_from_jmx(qry, property):
+  try:
+    response = urllib2.urlopen(qry)
+    data=response.read()
+  except Exception:
+    print WARNING_JMX_MESSAGE
+    exit(1)
+
+  data_dict = json.loads(data)
+  return (data_dict["beans"][0][property])
+
+def get_available_nn_host(options):
+  nn_hosts = options.host.split(" ")
+  for nn_host in nn_hosts:
+    try:
+      urllib2.urlopen("http://{host}:{port}/jmx".format(host=nn_host, port=options.port))
+      return nn_host
+    except Exception:
+      pass
+  print WARNING_JMX_MESSAGE
+  exit(1)
+
+if __name__ == "__main__":
+  main()
+
+
+

http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py
index 1f75057..9e66510 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py
@@ -65,6 +65,7 @@ def nagios_server_config():
   nagios_server_check( 'check_namenodes_ha.sh')
   nagios_server_check( 'check_wrapper.sh')
   nagios_server_check( 'hdp_nagios_init.php')
+  nagios_server_check( 'check_checkpoint_time.py' )
 
 
 def nagios_server_configfile(

http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py
index e8f0150..935521a 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py
@@ -73,6 +73,8 @@ supervisor_port = "56431"
 storm_rest_api_port = "8745"
 falcon_port = config['configurations']['global']['falcon_port']
 ahs_port = get_port_from_url(config['configurations']['yarn-site']['yarn.timeline-service.webapp.address'])
+dfs_namenode_checkpoint_period = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.period']
+dfs_namenode_checkpoint_txns = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.txns']
 
 # this is different for HDP1
 nn_metrics_property = "FSNamesystem"
@@ -160,6 +162,7 @@ _falcon_host = default("/clusterHostInfo/falcon_server_hosts", None)
 _hbase_rs_hosts = default("/clusterHostInfo/hbase_rs_hosts", _slave_hosts)
 _hue_server_host = default("/clusterHostInfo/hue_server_host", None)
 all_hosts = config['clusterHostInfo']['all_hosts']
+nn_hosts_string = " ".join(namenode_host)
 
 
 hostgroup_defs = {

http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
index da37d73..3d53b2b 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
@@ -135,4 +135,9 @@ define command{
   command_name check_tcp_wrapper
   command_line  $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$
 }
+
+define command{
+  command_name check_checkpoint_time
+  command_line python $USER1$/check_checkpoint_time.py -H "$ARG1$" -p $ARG2$ -w $ARG3$ -c $ARG4$ -t $ARG5$ -x $ARG6$
+}
         

http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2 b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2
index b77e77b..94fde2b 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2
@@ -409,6 +409,17 @@ define service {
 {%  endfor  %}
 
 define service {
+        host_name               {{namenode_host[0]}}
+        use                     hadoop-service
+        service_description     NAMENODE::Last checkpoint time
+        servicegroups           HDFS
+        check_command           check_checkpoint_time!{{ nn_hosts_string }}!{{ namenode_port }}!200!200!{{ dfs_namenode_checkpoint_period }}!{{dfs_namenode_checkpoint_txns}}
+        normal_check_interval   0.5
+        retry_check_interval    0.25
+        max_check_attempts      3
+}
+
+define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     HDFS::Blocks health

http://git-wip-us.apache.org/repos/asf/ambari/blob/f74e9503/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py
index 0839995..75e1839 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py
@@ -242,6 +242,11 @@ class TestNagiosServer(RMFTestCase):
                               content=StaticFile('hdp_nagios_init.php'),
                               mode=0755
     )
+    self.assertResourceCalled('File',
+                              '/usr/lib64/nagios/plugins/check_checkpoint_time.py',
+                              content=StaticFile('check_checkpoint_time.py'),
+                              mode=0755
+    )
     self.assertResourceCalled('Execute',
                               'htpasswd2 -c -b  /etc/nagios/htpasswd.users nagiosadmin \'!`"\'"\'"\' 1\'',
                               not_if="grep nagiosadmin /etc/nagios/htpasswd.users"


Mime
View raw message