ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jonathanhur...@apache.org
Subject ambari git commit: AMBARI-12317 - False alert on "NameNode High Availability Health" due to Missing JMX objects (jonathanhurley)
Date Wed, 08 Jul 2015 14:34:43 GMT
Repository: ambari
Updated Branches:
  refs/heads/trunk d637d5f0b -> bebe23872


AMBARI-12317 - False alert on "NameNode High Availability Health" due to Missing JMX objects
(jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/bebe2387
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/bebe2387
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/bebe2387

Branch: refs/heads/trunk
Commit: bebe23872db9cfbc666849fe589df4c7833f241e
Parents: d637d5f
Author: Jonathan Hurley <jhurley@hortonworks.com>
Authored: Tue Jul 7 13:49:56 2015 -0400
Committer: Jonathan Hurley <jhurley@hortonworks.com>
Committed: Wed Jul 8 10:33:52 2015 -0400

----------------------------------------------------------------------
 .../python/ambari_agent/alerts/metric_alert.py  | 15 ++--
 .../python/ambari_agent/alerts/web_alert.py     |  5 +-
 .../src/test/python/ambari_agent/TestAlerts.py  |  2 +-
 .../libraries/functions/curl_krb_request.py     | 20 +++--
 .../package/alerts/alert_checkpoint_time.py     | 14 ++-
 .../package/alerts/alert_ha_namenode_health.py  | 58 ++++++++++---
 .../package/alerts/alert_nodemanager_health.py  |  7 +-
 .../alerts/alert_nodemanagers_summary.py        | 13 ++-
 .../package/files/alert_ha_namenode_health.py   | 91 +++++++++++++++++---
 9 files changed, 176 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
index ece6063..6bcdacd 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
@@ -58,8 +58,9 @@ class MetricAlert(BaseAlert):
       if 'connection_timeout' in uri:
         connection_timeout = uri['connection_timeout']
 
-    # python uses 5.0, not 5
+    # python uses 5.0, CURL uses "5"
     self.connection_timeout = float(connection_timeout)
+    self.curl_connection_timeout = int(connection_timeout)
 
     self.config = config
 
@@ -206,7 +207,9 @@ class MetricAlert(BaseAlert):
           smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')
 
          response, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
-                                          "metric_alert", kerberos_executable_search_paths, False, self.get_name(), smokeuser)
+            "metric_alert", kerberos_executable_search_paths, False, self.get_name(), smokeuser,
+            connection_timeout=self.curl_connection_timeout)
+
           content = response
         else:
           url_opener = urllib2.build_opener(RefreshHeaderProcessor())
@@ -242,10 +245,10 @@ class MetricAlert(BaseAlert):
           value_list.append(json_data[attr])
 
       http_response_code = None
-      if not json_is_valid and security_enabled and \
-            kerberos_principal is not None and kerberos_keytab is not None:
-        http_response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
-                                              "metric_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser)
+      if not json_is_valid and security_enabled and kerberos_principal is not None and kerberos_keytab is not None:
+        http_response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab,
+          kerberos_principal, url, "metric_alert", kerberos_executable_search_paths, True,
+          self.get_name(), smokeuser, connection_timeout=self.curl_connection_timeout)
 
     return (value_list, http_response_code)
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
index 04d1b01..b76d5e0 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
@@ -68,7 +68,7 @@ class WebAlert(BaseAlert):
 
     # python uses 5.0, CURL uses "5"
     self.connection_timeout = float(connection_timeout)
-    self.curl_connection_timeout = str(int(connection_timeout))
+    self.curl_connection_timeout = int(connection_timeout)
 
     self.config = config
 
@@ -180,7 +180,8 @@ class WebAlert(BaseAlert):
         smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')
 
        response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
-                                               "web_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser)
+          "web_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser,
+          connection_timeout=self.curl_connection_timeout)
       else:
         # kerberos is not involved; use urllib2
         response_code, time_millis, error_msg = self._make_web_request_urllib(url)

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
index 26f1617..7b64445 100644
--- a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
+++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py
@@ -961,7 +961,7 @@ class TestAlerts(TestCase):
     definition_json = self._get_web_alert_definition()
     alert = WebAlert(definition_json, definition_json['source'], None)
     self.assertEquals(5.678, alert.connection_timeout)
-    self.assertEquals("5", alert.curl_connection_timeout)
+    self.assertEquals(5, alert.curl_connection_timeout)
 
     # the metric definition will not and should default to 5.0
     definition_json = self._get_metric_alert_definition()

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py
----------------------------------------------------------------------
diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py b/ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py
index b7adac4..589ec1e 100644
--- a/ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py
+++ b/ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py
@@ -41,15 +41,18 @@ except ImportError:
   import md5
   _md5 = md5.new
 
-CONNECTION_TIMEOUT = 10
-MAX_TIMEOUT = 12
+CONNECTION_TIMEOUT_DEFAULT = 10
+MAX_TIMEOUT_DEFAULT = CONNECTION_TIMEOUT_DEFAULT + 2
 
 logger = logging.getLogger()
 
 
-def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix, krb_exec_search_paths,
-                     return_only_http_code, alert_name, user):
+def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix,
+    krb_exec_search_paths, return_only_http_code, alert_name, user,
+    connection_timeout = CONNECTION_TIMEOUT_DEFAULT):
+
   import uuid
+
   # Create the kerberos credentials cache (ccache) file and set it in the environment to use
   # when executing curl. Use the md5 hash of the combination of the principal and keytab file
   # to generate a (relatively) unique cache filename so that we can use it as needed.
@@ -88,15 +91,20 @@ def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix, krb_exe
 
   start_time = time.time()
   error_msg = None
+
+  # setup timeouts for the request; ensure we use integers since that is what curl needs
+  connection_timeout = int(connection_timeout)
+  maximum_timeout = connection_timeout + 2
+
   try:
     if return_only_http_code:
       _, curl_stdout, curl_stderr = shell.checked_call(['curl', '-k', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, '-w',
-                             '%{http_code}', url, '--connect-timeout', str(CONNECTION_TIMEOUT), '--max-time', str(MAX_TIMEOUT), '-o', '/dev/null'],
+                             '%{http_code}', url, '--connect-timeout', str(connection_timeout), '--max-time', str(maximum_timeout), '-o', '/dev/null'],
                              stderr=subprocess.PIPE, env=kerberos_env, user=user)
     else:
       # returns response body
       _, curl_stdout, curl_stderr = shell.checked_call(['curl', '-k', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
-                             url, '--connect-timeout', str(CONNECTION_TIMEOUT), '--max-time', str(MAX_TIMEOUT)],
+                             url, '--connect-timeout', str(connection_timeout), '--max-time', str(maximum_timeout)],
                              stderr=subprocess.PIPE, env=kerberos_env, user=user)
   except Fail:
     if logger.isEnabledFor(logging.DEBUG):

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
index 06e4c56..c1e0b54 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
@@ -147,15 +147,21 @@ def execute(configurations={}, parameters={}, host_name=None):
   try:
     if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
       env = Environment.get_instance()
+
+      # curl requires an integer timeout
+      curl_connection_timeout = int(connection_timeout)
+
       last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
-                                    kerberos_principal, last_checkpoint_time_qry,"checkpoint_time_alert", None, False,
-                                    "NameNode Last Checkpoint", smokeuser)
+        kerberos_principal, last_checkpoint_time_qry,"checkpoint_time_alert", None, False,
+        "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout)
+
       last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
       last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])
 
       journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
-                                      kerberos_principal, journal_transaction_info_qry,"checkpoint_time_alert", None,
-                                      False, "NameNode Last Checkpoint", smokeuser)
+        kerberos_principal, journal_transaction_info_qry,"checkpoint_time_alert", None,
+        False, "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout)
+
       journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
       journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
     else:

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
index e09ec3a..a076825 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
@@ -116,11 +116,11 @@ def execute(configurations={}, parameters={}, host_name=None):
     return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])
 
   namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
-  jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
+  jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
 
   if is_ssl_enabled:
     namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
-    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
+    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
 
 
   active_namenodes = []
@@ -142,13 +142,18 @@ def execute(configurations={}, parameters={}, host_name=None):
         jmx_uri = jmx_uri_fragment.format(value)
         if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
           env = Environment.get_instance()
-          state_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
-                                                    jmx_uri,"ha_nn_health", None, False,
-                                                    "NameNode High Availability Health", smokeuser)
-          state_response_json = json.loads(state_response)
-          state = state_response_json["beans"][0]['State']
+
+          # curl requires an integer timeout
+          curl_connection_timeout = int(connection_timeout)
+
+          state_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir,
+            kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", None, False,
+            "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout)
+
+          state = _get_ha_state_from_json(state_response)
         else:
-          state = get_value_from_jmx(jmx_uri, 'State', connection_timeout)
+          state_response = get_jmx(jmx_uri, connection_timeout)
+          state = _get_ha_state_from_json(state_response)
 
         if state == HDFS_NN_STATE_ACTIVE:
           active_namenodes.append(value)
@@ -204,18 +209,45 @@ def execute(configurations={}, parameters={}, host_name=None):
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
 
 
-def get_value_from_jmx(query, jmx_property, connection_timeout):
+def get_jmx(query, connection_timeout):
   response = None
   
   try:
     response = urllib2.urlopen(query, timeout=connection_timeout)
-    data = response.read()
-
-    data_dict = json.loads(data)
-    return data_dict["beans"][0][jmx_property]
+    json_data = response.read()
+    return json_data
   finally:
     if response is not None:
       try:
         response.close()
       except:
         pass
+
+
+def _get_ha_state_from_json(string_json):
+  """
+  Searches through the specified JSON string looking for either the HDP 2.0 or 2.1+ HA state
+  enumerations.
+  :param string_json: the string JSON
+  :return:  the value of the HA state (active, standby, etc)
+  """
+  json_data = json.loads(string_json)
+  jmx_beans = json_data["beans"]
+
+  # look for HDP 2.1+ first
+  for jmx_bean in jmx_beans:
+    if "name" not in jmx_bean:
+      continue
+
+    jmx_bean_name = jmx_bean["name"]
+    if jmx_bean_name == "Hadoop:service=NameNode,name=NameNodeStatus" and "State" in jmx_bean:
+      return jmx_bean["State"]
+
+  # look for HDP 2.0 last
+  for jmx_bean in jmx_beans:
+    if "name" not in jmx_bean:
+      continue
+
+    jmx_bean_name = jmx_bean["name"]
+    if jmx_bean_name == "Hadoop:service=NameNode,name=FSNamesystem":
+      return jmx_bean["tag.HAState"]

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
index a46ad93..05b6a2a 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
@@ -141,8 +141,13 @@ def execute(configurations={}, parameters={}, host_name=None):
   try:
     if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
       env = Environment.get_instance()
+
+      # curl requires an integer timeout
+      curl_connection_timeout = int(connection_timeout)
+
       url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
-                                                query, "nm_health_alert", None, False, "NodeManager Health", smokeuser)
+        query, "nm_health_alert", None, False, "NodeManager Health", smokeuser,
+        connection_timeout=curl_connection_timeout)
 
       json_response = json.loads(url_response)
     else:

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
index 390576d..845b616 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
@@ -115,9 +115,14 @@ def execute(configurations={}, parameters={}, host_name=None):
   try:
     if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
       env = Environment.get_instance()
+
+      # curl requires an integer timeout
+      curl_connection_timeout = int(connection_timeout)
+
       url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
-                                              live_nodemanagers_qry, "nm_health_summary_alert", None, False,
-                                              "NodeManager Health Summary", smokeuser)
+        live_nodemanagers_qry, "nm_health_summary_alert", None, False,
+        "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)
+
       try:
         url_response_json = json.loads(url_response)
         live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
@@ -129,8 +134,8 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       if convert_to_json_failed:
         response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
-                                                    live_nodemanagers_qry, "nm_health_summary_alert", None, True,
-                                                    "NodeManager Health Summary", smokeuser)
+          live_nodemanagers_qry, "nm_health_summary_alert", None, True,
+          "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)
     else:
       live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry,
       "LiveNodeManagers", connection_timeout))

http://git-wip-us.apache.org/repos/asf/ambari/blob/bebe2387/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py
index 19e3170..a076825 100644
--- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py
+++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py
@@ -19,7 +19,11 @@ limitations under the License.
 """
 
 import urllib2
-import json
+import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
+import logging
+
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
+from resource_management.core.environment import Environment
 
 RESULT_STATE_OK = 'OK'
 RESULT_STATE_CRITICAL = 'CRITICAL'
@@ -35,17 +39,24 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
 NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
 DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
 
+KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
+KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
+
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
+logger = logging.getLogger()
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
   return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY,
-  NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY)
-
+  NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY, SMOKEUSER_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
+  
 
 def execute(configurations={}, parameters={}, host_name=None):
   """
@@ -66,12 +77,28 @@ def execute(configurations={}, parameters={}, host_name=None):
   # hdfs-site is required
   if not HDFS_SITE_KEY in configurations:
     return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
+  
+  if SMOKEUSER_KEY in configurations:
+    smokeuser = configurations[SMOKEUSER_KEY]
 
   # parse script arguments
   connection_timeout = CONNECTION_TIMEOUT_DEFAULT
   if CONNECTION_TIMEOUT_KEY in parameters:
     connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
 
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in configurations:
+    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+  kerberos_keytab = None
+  if KERBEROS_KEYTAB in configurations:
+    kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+  kerberos_principal = None
+  if KERBEROS_PRINCIPAL in configurations:
+    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+    kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
 
   # determine whether or not SSL is enabled
   is_ssl_enabled = False
@@ -89,11 +116,11 @@ def execute(configurations={}, parameters={}, host_name=None):
     return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])
 
   namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
-  jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
+  jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
 
   if is_ssl_enabled:
     namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
-    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
+    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
 
 
   active_namenodes = []
@@ -113,7 +140,20 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       try:
         jmx_uri = jmx_uri_fragment.format(value)
-        state = get_value_from_jmx(jmx_uri, 'State', connection_timeout)
+        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+          env = Environment.get_instance()
+
+          # curl requires an integer timeout
+          curl_connection_timeout = int(connection_timeout)
+
+          state_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir,
+            kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", None, False,
+            "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout)
+
+          state = _get_ha_state_from_json(state_response)
+        else:
+          state_response = get_jmx(jmx_uri, connection_timeout)
+          state = _get_ha_state_from_json(state_response)
 
         if state == HDFS_NN_STATE_ACTIVE:
           active_namenodes.append(value)
@@ -169,18 +209,45 @@ def execute(configurations={}, parameters={}, host_name=None):
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
 
 
-def get_value_from_jmx(query, jmx_property, connection_timeout):
+def get_jmx(query, connection_timeout):
   response = None
-
+  
   try:
     response = urllib2.urlopen(query, timeout=connection_timeout)
-    data = response.read()
-
-    data_dict = json.loads(data)
-    return data_dict["beans"][0][jmx_property]
+    json_data = response.read()
+    return json_data
   finally:
     if response is not None:
       try:
         response.close()
       except:
         pass
+
+
+def _get_ha_state_from_json(string_json):
+  """
+  Searches through the specified JSON string looking for either the HDP 2.0 or 2.1+ HA state
+  enumerations.
+  :param string_json: the string JSON
+  :return:  the value of the HA state (active, standby, etc)
+  """
+  json_data = json.loads(string_json)
+  jmx_beans = json_data["beans"]
+
+  # look for HDP 2.1+ first
+  for jmx_bean in jmx_beans:
+    if "name" not in jmx_bean:
+      continue
+
+    jmx_bean_name = jmx_bean["name"]
+    if jmx_bean_name == "Hadoop:service=NameNode,name=NameNodeStatus" and "State" in jmx_bean:
+      return jmx_bean["State"]
+
+  # look for HDP 2.0 last
+  for jmx_bean in jmx_beans:
+    if "name" not in jmx_bean:
+      continue
+
+    jmx_bean_name = jmx_bean["name"]
+    if jmx_bean_name == "Hadoop:service=NameNode,name=FSNamesystem":
+      return jmx_bean["tag.HAState"]


Mime
View raw message