ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From avija...@apache.org
Subject [1/2] ambari git commit: AMBARI-21106 : ML-Prototype: Detect timeseries anomaly for a metric. (Refine PIT & Trend subsystems, Integrate with AMS, Ambari Alerts.)
Date Tue, 26 Sep 2017 21:39:12 GMT
Repository: ambari
Updated Branches:
  refs/heads/branch-3.0-ams 63e743557 -> a11d1033d


http://git-wip-us.apache.org/repos/asf/ambari/blob/a11d1033/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_point_in_time_metric_anomalies.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_point_in_time_metric_anomalies.py b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_point_in_time_metric_anomalies.py
new file mode 100644
index 0000000..154ce1c
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_point_in_time_metric_anomalies.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import urllib
+import time
+import os
+import ambari_commons.network as network
+import logging
+
+from ambari_agent.AmbariConfig import AmbariConfig
+
+RESULT_STATE_OK = 'OK'
+RESULT_STATE_CRITICAL = 'CRITICAL'
+RESULT_STATE_WARNING = 'WARNING'
+RESULT_STATE_UNKNOWN = 'UNKNOWN'
+RESULT_STATE_SKIPPED = 'SKIPPED'
+
+AMS_HTTP_POLICY = '{{ams-site/timeline.metrics.service.http.policy}}'
+METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY = '{{ams-site/timeline.metrics.service.webapp.address}}'
+METRICS_COLLECTOR_VIP_HOST_KEY = '{{cluster-env/metrics_collector_vip_host}}'
+METRICS_COLLECTOR_VIP_PORT_KEY = '{{cluster-env/metrics_collector_vip_port}}'
+
+INTERVAL_PARAM_KEY = 'interval'
+INTERVAL_PARAM_DEFAULT = 10
+
+NUM_ANOMALIES_KEY = 'num_anomalies'
+NUM_ANOMALIES_DEFAULT = 5
+
+SENSITIVITY_KEY = 'sensitivity'
+SENSITIVITY_DEFAULT = 5
+
+AMS_METRICS_GET_URL = "/ws/v1/timeline/metrics/anomalies?%s"
+
+logger = logging.getLogger()
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (METRICS_COLLECTOR_VIP_HOST_KEY, METRICS_COLLECTOR_VIP_PORT_KEY,
+          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, AMS_HTTP_POLICY)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  """
+  Get ready with AMS GET url.
+  Query AMS for point in time anomalies in the last 30mins. 
+  Generate a message with anomalies.
+  """
+  if configurations is None:
+    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
+
+  collector_host = host_name
+  current_time = int(time.time()) * 1000
+
+  interval = INTERVAL_PARAM_DEFAULT
+  if INTERVAL_PARAM_KEY in parameters:
+    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])
+
+  num_anomalies = NUM_ANOMALIES_DEFAULT
+  if NUM_ANOMALIES_KEY in parameters:
+    num_anomalies = _coerce_to_integer(parameters[NUM_ANOMALIES_KEY])
+
+  sensitivity = SENSITIVITY_DEFAULT
+  if SENSITIVITY_KEY in parameters:
+    sensitivity = _coerce_to_integer(parameters[SENSITIVITY_KEY])
+
+  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
+    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
+    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
+  else:
+    # ams-site/timeline.metrics.service.webapp.address is required
+    if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
+      return (RESULT_STATE_UNKNOWN,
+              ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
+    else:
+      collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
+      if valid_collector_webapp_address(collector_webapp_address):
+        collector_port = int(collector_webapp_address[1])
+      else:
+        return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
+          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])
+
+  get_ema_anomalies_parameters = {
+    "method": "ema",
+    "startTime": current_time - interval * 60 * 1000,
+    "endTime": current_time,
+    "limit": num_anomalies + 1
+  }
+
+  encoded_get_metrics_parameters = urllib.urlencode(get_ema_anomalies_parameters)
+
+  ams_collector_conf_dir = "/etc/ambari-metrics-collector/conf"
+  metric_truststore_ca_certs = 'ca.pem'
+  ca_certs = os.path.join(ams_collector_conf_dir,
+                          metric_truststore_ca_certs)
+  metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"
+
+  try:
+    conn = network.get_http_connection(
+      collector_host,
+      int(collector_port),
+      metric_collector_https_enabled,
+      ca_certs,
+      ssl_version=AmbariConfig.get_resolved_config().get_force_https_protocol_value()
+    )
+    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
+    response = conn.getresponse()
+    data = response.read()
+    logger.info("Data read from metric anomaly endpoint")
+    logger.info(data)
+    conn.close()
+  except Exception:
+    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve anomaly metrics from the Ambari Metrics service."])
+
+  if response.status != 200:
+    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve anomaly metrics from the Ambari Metrics service."])
+
+  data_json = json.loads(data)
+  length = len(data_json["metrics"])
+  logger.info("Number of anomalies returned : {0}".format(length))
+
+  if length == 0:
+    alert_state = RESULT_STATE_OK
+    alert_label = 'No point in time anomalies in the last {0} minutes.'.format(interval)
+    logger.info(alert_label)
+  elif length <= 5:
+    alert_state = RESULT_STATE_WARNING
+    alert_label = "http://avijayan-ad-1.openstacklocal:3000/dashboard/script/scripted.js?anomalies=" + data
+  else:
+    alert_state = RESULT_STATE_CRITICAL
+    alert_label = "http://avijayan-ad-1.openstacklocal:3000/dashboard/script/scripted.js?anomalies=" + data
+
+  return (alert_state, [alert_label])
+
+
+def valid_collector_webapp_address(webapp_address):
+  if len(webapp_address) == 2 \
+    and webapp_address[0] != '127.0.0.1' \
+    and webapp_address[1].isdigit():
+    return True
+
+  return False
+
+
+def _coerce_to_integer(value):
+  """
+  Attempts to correctly coerce a value to an integer. For the case of an integer or a float,
+  this will essentially either NOOP or return a truncated value. If the parameter is a string,
+  then it will first attempt to be coerced from a integer, and failing that, a float.
+  :param value: the value to coerce
+  :return: the coerced value as an integer
+  """
+  try:
+    return int(value)
+  except ValueError:
+    return int(float(value))

http://git-wip-us.apache.org/repos/asf/ambari/blob/a11d1033/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_trend_metric_anomalies.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_trend_metric_anomalies.py b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_trend_metric_anomalies.py
new file mode 100644
index 0000000..8813d8e
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_trend_metric_anomalies.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import urllib
+import time
+import os
+import ambari_commons.network as network
+import logging
+
+from ambari_agent.AmbariConfig import AmbariConfig
+
+RESULT_STATE_OK = 'OK'
+RESULT_STATE_CRITICAL = 'CRITICAL'
+RESULT_STATE_WARNING = 'WARNING'
+RESULT_STATE_UNKNOWN = 'UNKNOWN'
+RESULT_STATE_SKIPPED = 'SKIPPED'
+
+AMS_HTTP_POLICY = '{{ams-site/timeline.metrics.service.http.policy}}'
+METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY = '{{ams-site/timeline.metrics.service.webapp.address}}'
+METRICS_COLLECTOR_VIP_HOST_KEY = '{{cluster-env/metrics_collector_vip_host}}'
+METRICS_COLLECTOR_VIP_PORT_KEY = '{{cluster-env/metrics_collector_vip_port}}'
+
+INTERVAL_PARAM_KEY = 'interval'
+INTERVAL_PARAM_DEFAULT = 10
+
+NUM_ANOMALIES_KEY = 'num_anomalies'
+NUM_ANOMALIES_DEFAULT = 5
+
+SENSITIVITY_KEY = 'sensitivity'
+SENSITIVITY_DEFAULT = 5
+
+AMS_METRICS_GET_URL = "/ws/v1/timeline/metrics/anomalies?%s"
+
+logger = logging.getLogger()
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (METRICS_COLLECTOR_VIP_HOST_KEY, METRICS_COLLECTOR_VIP_PORT_KEY,
+          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, AMS_HTTP_POLICY)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  """
+  Get ready with AMS GET url.
+  Query AMS for point in time anomalies in the last 30mins. 
+  Generate a message with anomalies.
+  """
+  if configurations is None:
+    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
+
+  collector_host = host_name
+  current_time = int(time.time()) * 1000
+
+  interval = INTERVAL_PARAM_DEFAULT
+  if INTERVAL_PARAM_KEY in parameters:
+    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])
+
+  num_anomalies = NUM_ANOMALIES_DEFAULT
+  if NUM_ANOMALIES_KEY in parameters:
+    num_anomalies = _coerce_to_integer(parameters[NUM_ANOMALIES_KEY])
+
+  sensitivity = SENSITIVITY_DEFAULT
+  if SENSITIVITY_KEY in parameters:
+    sensitivity = _coerce_to_integer(parameters[SENSITIVITY_KEY])
+
+  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
+    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
+    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
+  else:
+    # ams-site/timeline.metrics.service.webapp.address is required
+    if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
+      return (RESULT_STATE_UNKNOWN,
+              ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
+    else:
+      collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
+      if valid_collector_webapp_address(collector_webapp_address):
+        collector_port = int(collector_webapp_address[1])
+      else:
+        return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
+          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])
+
+  get_ema_anomalies_parameters = {
+    "method": "ks",
+    "startTime": current_time - interval * 60 * 1000,
+    "endTime": current_time,
+    "limit": num_anomalies + 1
+  }
+
+  encoded_get_metrics_parameters = urllib.urlencode(get_ema_anomalies_parameters)
+
+  ams_collector_conf_dir = "/etc/ambari-metrics-collector/conf"
+  metric_truststore_ca_certs = 'ca.pem'
+  ca_certs = os.path.join(ams_collector_conf_dir,
+                          metric_truststore_ca_certs)
+  metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"
+
+  try:
+    conn = network.get_http_connection(
+      collector_host,
+      int(collector_port),
+      metric_collector_https_enabled,
+      ca_certs,
+      ssl_version=AmbariConfig.get_resolved_config().get_force_https_protocol_value()
+    )
+    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
+    response = conn.getresponse()
+    data = response.read()
+    logger.info("Data read from metric anomaly endpoint")
+    logger.info(data)
+    conn.close()
+  except Exception:
+    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve anomaly metrics from the Ambari Metrics service."])
+
+  if response.status != 200:
+    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve anomaly metrics from the Ambari Metrics service."])
+
+  data_json = json.loads(data)
+  length = len(data_json["metrics"])
+  logger.info("Number of anomalies returned : {0}".format(length))
+
+  if length == 0:
+    alert_state = RESULT_STATE_OK
+    alert_label = 'No trend anomalies in the last {0} minutes.'.format(interval)
+    logger.info(alert_label)
+  elif length <= 5:
+    alert_state = RESULT_STATE_WARNING
+    alert_label = "http://avijayan-ad-1.openstacklocal:3000/dashboard/script/scripted.js?anomalies=" + data
+  else:
+    alert_state = RESULT_STATE_CRITICAL
+    alert_label = "http://avijayan-ad-1.openstacklocal:3000/dashboard/script/scripted.js?anomalies=" + data
+
+  return (alert_state, [alert_label])
+
+
+def valid_collector_webapp_address(webapp_address):
+  if len(webapp_address) == 2 \
+    and webapp_address[0] != '127.0.0.1' \
+    and webapp_address[1].isdigit():
+    return True
+
+  return False
+
+
+def _coerce_to_integer(value):
+  """
+  Attempts to correctly coerce a value to an integer. For the case of an integer or a float,
+  this will essentially either NOOP or return a truncated value. If the parameter is a string,
+  then it will first attempt to be coerced from a integer, and failing that, a float.
+  :param value: the value to coerce
+  :return: the coerced value as an integer
+  """
+  try:
+    return int(value)
+  except ValueError:
+    return int(float(value))

http://git-wip-us.apache.org/repos/asf/ambari/blob/a11d1033/ambari-server/src/main/resources/common-services/HDFS/3.0.0.3.0/package/alerts/alert_metrics_deviation.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/3.0.0.3.0/package/alerts/alert_metrics_deviation.py b/ambari-server/src/main/resources/common-services/HDFS/3.0.0.3.0/package/alerts/alert_metrics_deviation.py
index bc2102a..7546de0 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/3.0.0.3.0/package/alerts/alert_metrics_deviation.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/3.0.0.3.0/package/alerts/alert_metrics_deviation.py
@@ -325,10 +325,12 @@ def execute(configurations={}, parameters={}, host_name=None):
     response = conn.getresponse()
     data = response.read()
     conn.close()
-  except Exception:
+  except Exception, e:
+    logger.info(str(e))
     return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])
 
   if response.status != 200:
+    logger.info(str(data))
     return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])
 
   data_json = json.loads(data)

http://git-wip-us.apache.org/repos/asf/ambari/blob/a11d1033/ambari-server/src/test/java/org/apache/ambari/server/controller/metrics/timeline/MetricsPaddingMethodTest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/controller/metrics/timeline/MetricsPaddingMethodTest.java b/ambari-server/src/test/java/org/apache/ambari/server/controller/metrics/timeline/MetricsPaddingMethodTest.java
index 785b36b..6bd2f4b 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/controller/metrics/timeline/MetricsPaddingMethodTest.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/controller/metrics/timeline/MetricsPaddingMethodTest.java
@@ -39,6 +39,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now - 1000, 1.0d);
     inputValues.put(now - 2000, 2.0d);
@@ -66,6 +67,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now - 1000, 1.0d);
     inputValues.put(now - 2000, 2.0d);
@@ -93,6 +95,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now, 0.0d);
     inputValues.put(now - 1000, 1.0d);
@@ -120,6 +123,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now - 1000, 1.0d);
     timelineMetric.setMetricValues(inputValues);
@@ -145,6 +149,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now - 1000, 1.0d);
     timelineMetric.setMetricValues(inputValues);
@@ -168,6 +173,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
 
     long seconds = 1000;
@@ -228,6 +234,7 @@ public class MetricsPaddingMethodTest {
     timelineMetric.setMetricName("m1");
     timelineMetric.setHostName("h1");
     timelineMetric.setAppId("a1");
+    timelineMetric.setStartTime(now);
     TreeMap<Long, Double> inputValues = new TreeMap<>();
     inputValues.put(now - 100, 1.0d);
     inputValues.put(now - 200, 2.0d);


Mime
View raw message