ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jonathanhur...@apache.org
Subject [1/2] ambari git commit: AMBARI-20467 - Add alerts for Livy in Spark and Spark 2 (Mingjie Tang via jonathanhurley)
Date Fri, 28 Apr 2017 12:55:17 GMT
Repository: ambari
Updated Branches:
  refs/heads/branch-2.5 fb9d4987c -> 0f8e1a665


AMBARI-20467 - Add alerts for Livy in Spark and Spark 2 (Mingjie Tang via jonathanhurley)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/9739c04e
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/9739c04e
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/9739c04e

Branch: refs/heads/branch-2.5
Commit: 9739c04e1e46572fc73d3b04f24b742f009a14c9
Parents: fb9d498
Author: Jonathan Hurley <jhurley@hortonworks.com>
Authored: Fri Apr 28 08:46:30 2017 -0400
Committer: Jonathan Hurley <jhurley@hortonworks.com>
Committed: Fri Apr 28 08:55:02 2017 -0400

----------------------------------------------------------------------
 .../common-services/SPARK/1.2.1/alerts.json     |  24 +++
 .../scripts/alerts/alert_spark_livy_port.py     | 146 +++++++++++++++++++
 .../common-services/SPARK2/2.0.0/alerts.json    |  24 +++
 .../scripts/alerts/alert_spark2_livy_port.py    | 146 +++++++++++++++++++
 4 files changed, 340 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/9739c04e/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json
index 0e38f16..d3c1a59 100644
--- a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json
+++ b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/alerts.json
@@ -27,6 +27,30 @@
           }
         }
       }
+    ],
+    "LIVY_SERVER": [
+      {
+        "name": "livy_server_status",
+        "label": "Spark Livy Server",
+        "description": "This host-level alert is triggered if the Livy Server cannot be determined to be up.",
+        "interval": 1,
+        "scope": "HOST",
+        "source": {
+          "type": "SCRIPT",
+          "path": "SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py",
+          "parameters": [
+            {
+              "name": "check.command.timeout",
+              "display_name": "Command Timeout",
+              "value": 60.0,
+              "type": "NUMERIC",
+              "description": "The maximum time before check command will be killed by timeout",
+              "units": "seconds",
+              "threshold": "CRITICAL"
+            }
+          ]
+        }
+      }
     ]
   }
 }

http://git-wip-us.apache.org/repos/asf/ambari/blob/9739c04e/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py
new file mode 100644
index 0000000..7396440
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/SPARK/1.2.1/package/scripts/alerts/alert_spark_livy_port.py
@@ -0,0 +1,146 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import logging
+import traceback
+import socket
+from resource_management import *
+from resource_management.libraries.functions import format
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
+from resource_management.libraries.script.script import Script
+from resource_management.core.resources import Execute
+from resource_management.core.logger import Logger
+from resource_management.core import global_lock
+from resource_management.libraries.functions import get_kinit_path
+
+
+OK_MESSAGE = "TCP OK - {0:.3f}s response on port {1}"
+CRITICAL_MESSAGE = "Connection failed on host {0}:{1} ({2})"
+
+logger = logging.getLogger('ambari_alerts')
+
+LIVY_SERVER_PORT_KEY = '{{livy-conf/livy.server.port}}'
+
+LIVYUSER_DEFAULT = 'livy'
+
+CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
+CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0
+
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}'
+SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}'
+SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
+
+# The configured Kerberos executable search paths, if any
+KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}'
+
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def get_tokens():
+    """
+    Returns a tuple of tokens in the format {{site/property}} that will be used
+    to build the dictionary passed into execute
+    """
+    return (LIVY_SERVER_PORT_KEY,LIVYUSER_DEFAULT,SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,SMOKEUSER_KEY)
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def execute(configurations={}, parameters={}, host_name=None):
+    """
+    Returns a tuple containing the result code and a pre-formatted result label
+
+    Keyword arguments:
+    configurations (dictionary): a mapping of configuration key to value
+    parameters (dictionary): a mapping of script parameter key to value
+    host_name (string): the name of this host where the alert is running
+    """
+
+    if configurations is None:
+        return ('UNKNOWN', ['There were no configurations supplied to the script.'])
+
+    LIVY_PORT_DEFAULT = 8998
+
+    port = LIVY_PORT_DEFAULT
+    if LIVY_SERVER_PORT_KEY in configurations:
+        port = int(configurations[LIVY_SERVER_PORT_KEY])
+
+    if host_name is None:
+        host_name = socket.getfqdn()
+
+    livyuser = LIVYUSER_DEFAULT
+
+    security_enabled = False
+    if SECURITY_ENABLED_KEY in configurations:
+        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+    smokeuser_kerberos_keytab = None
+    if SMOKEUSER_KEYTAB_KEY in configurations:
+        smokeuser_kerberos_keytab = configurations[SMOKEUSER_KEYTAB_KEY]
+
+    if host_name is None:
+        host_name = socket.getfqdn()
+
+    smokeuser_principal = None
+    if SMOKEUSER_PRINCIPAL_KEY in configurations:
+        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
+        smokeuser_principal = smokeuser_principal.replace('_HOST',host_name.lower())
+
+    # Get the configured Kerberos executable search paths, if any
+    if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
+        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
+     else:
+        kerberos_executable_search_paths = None
+
+    kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
+
+    if security_enabled:
+        kinitcmd = format("{kinit_path_local} -kt {smokeuser_kerberos_keytab} {smokeuser_principal}; ")
+        # prevent concurrent kinit
+        kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
+        kinit_lock.acquire()
+        try:
+            Execute(kinitcmd, user=livyuser)
+        finally:
+            kinit_lock.release()
+
+    result_code = None
+    try:
+        start_time = time.time()
+        try:
+            livy_livyserver_host = str(host_name)
+
+            livy_cmd = format("curl -s -o /dev/null -w'%{{http_code}}' --negotiate -u: -k http://{livy_livyserver_host}:{port}/sessions | grep 200 ")
+
+            Execute(livy_cmd,
+                    tries=3,
+                    try_sleep=1,
+                    logoutput=True,
+                    user=livyuser
+                    )
+
+            total_time = time.time() - start_time
+            result_code = 'OK'
+            label = OK_MESSAGE.format(total_time, port)
+        except:
+            result_code = 'CRITICAL'
+            label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
+    except:
+        label = traceback.format_exc()
+        result_code = 'UNKNOWN'
+
+    return (result_code, [label])

http://git-wip-us.apache.org/repos/asf/ambari/blob/9739c04e/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json
index dc9d023..2e03f13 100755
--- a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/alerts.json
@@ -27,6 +27,30 @@
           }
         }
       }
+    ],
+    "LIVY2_SERVER": [
+      {
+        "name": "livy2_server_status",
+        "label": "Spark2 Livy Server",
+        "description": "This host-level alert is triggered if the Livy2 Server cannot be determined to be up.",
+        "interval": 1,
+        "scope": "HOST",
+        "source": {
+          "type": "SCRIPT",
+          "path": "SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py",
+          "parameters": [
+            {
+              "name": "check.command.timeout",
+              "display_name": "Command Timeout",
+              "value": 60.0,
+              "type": "NUMERIC",
+              "description": "The maximum time before check command will be killed by timeout",
+              "units": "seconds",
+              "threshold": "CRITICAL"
+            }
+          ]
+        }
+      }
     ]
   }
 }

http://git-wip-us.apache.org/repos/asf/ambari/blob/9739c04e/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py
new file mode 100644
index 0000000..44c284f
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/SPARK2/2.0.0/package/scripts/alerts/alert_spark2_livy_port.py
@@ -0,0 +1,146 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import logging
+import traceback
+import socket
+from resource_management import *
+from resource_management.libraries.functions import format
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
+from resource_management.libraries.script.script import Script
+from resource_management.core.resources import Execute
+from resource_management.core.logger import Logger
+from resource_management.core import global_lock
+from resource_management.libraries.functions import get_kinit_path
+
+
+OK_MESSAGE = "TCP OK - {0:.3f}s response on port {1}"
+CRITICAL_MESSAGE = "Connection failed on host {0}:{1} ({2})"
+
+logger = logging.getLogger('ambari_alerts')
+
+LIVY_SERVER_PORT_KEY = '{{livy2-conf/livy.server.port}}'
+
+LIVYUSER_DEFAULT = 'livy'
+
+CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
+CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0
+
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}'
+SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}'
+SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
+
+# The configured Kerberos executable search paths, if any
+KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}'
+
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def get_tokens():
+    """
+    Returns a tuple of tokens in the format {{site/property}} that will be used
+    to build the dictionary passed into execute
+    """
+    return (LIVY_SERVER_PORT_KEY,LIVYUSER_DEFAULT,SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,SMOKEUSER_KEY)
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def execute(configurations={}, parameters={}, host_name=None):
+    """
+    Returns a tuple containing the result code and a pre-formatted result label
+
+    Keyword arguments:
+    configurations (dictionary): a mapping of configuration key to value
+    parameters (dictionary): a mapping of script parameter key to value
+    host_name (string): the name of this host where the alert is running
+    """
+
+    if configurations is None:
+        return ('UNKNOWN', ['There were no configurations supplied to the script.'])
+
+    LIVY_PORT_DEFAULT = 8999
+
+    port = LIVY_PORT_DEFAULT
+    if LIVY_SERVER_PORT_KEY in configurations:
+        port = int(configurations[LIVY_SERVER_PORT_KEY])
+
+    if host_name is None:
+        host_name = socket.getfqdn()
+
+    livyuser = LIVYUSER_DEFAULT
+
+    security_enabled = False
+    if SECURITY_ENABLED_KEY in configurations:
+        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+    smokeuser_kerberos_keytab = None
+    if SMOKEUSER_KEYTAB_KEY in configurations:
+        smokeuser_kerberos_keytab = configurations[SMOKEUSER_KEYTAB_KEY]
+
+    if host_name is None:
+        host_name = socket.getfqdn()
+
+    smokeuser_principal = None
+    if SMOKEUSER_PRINCIPAL_KEY in configurations:
+        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
+        smokeuser_principal = smokeuser_principal.replace('_HOST',host_name.lower())
+
+    # Get the configured Kerberos executable search paths, if any
+    if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
+        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
+    else:
+        kerberos_executable_search_paths = None
+
+    kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
+
+    if security_enabled:
+        kinitcmd = format("{kinit_path_local} -kt {smokeuser_kerberos_keytab} {smokeuser_principal}; ")
+        # prevent concurrent kinit
+        kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
+        kinit_lock.acquire()
+        try:
+            Execute(kinitcmd, user=livyuser)
+        finally:
+            kinit_lock.release()
+
+    result_code = None
+    try:
+        start_time = time.time()
+        try:
+            livy2_livyserver_host = str(host_name)
+
+            livy_cmd = format("curl -s -o /dev/null -w'%{{http_code}}' --negotiate -u: -k http://{livy2_livyserver_host}:{port}/sessions | grep 200 ")
+
+            Execute(livy_cmd,
+                    tries=3,
+                    try_sleep=1,
+                    logoutput=True,
+                    user=livyuser
+                    )
+
+            total_time = time.time() - start_time
+            result_code = 'OK'
+            label = OK_MESSAGE.format(total_time, port)
+        except:
+            result_code = 'CRITICAL'
+            label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
+    except:
+        label = traceback.format_exc()
+        result_code = 'UNKNOWN'
+
+    return (result_code, [label])


Mime
View raw message