Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id B5188200CF2 for ; Tue, 8 Aug 2017 11:12:15 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id B3648166F54; Tue, 8 Aug 2017 09:12:15 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 6481D166F53 for ; Tue, 8 Aug 2017 11:12:13 +0200 (CEST) Received: (qmail 48939 invoked by uid 500); 8 Aug 2017 09:12:12 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 48930 invoked by uid 99); 8 Aug 2017 09:12:12 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Aug 2017 09:12:12 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 2FF60F323C; Tue, 8 Aug 2017 09:12:10 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: aonishuk@apache.org To: commits@ambari.apache.org Date: Tue, 08 Aug 2017 09:12:10 -0000 Message-Id: <87355edacc754d6aad9280fa71f51980@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [1/2] ambari git commit: AMBARI-21667. Create a topic to send alert_definitions (aonishuk) archived-at: Tue, 08 Aug 2017 09:12:15 -0000 Repository: ambari Updated Branches: refs/heads/branch-3.0-perf 44c1cb512 -> 6578b5a28 http://git-wip-us.apache.org/repos/asf/ambari/blob/6578b5a2/ambari-agent/src/test/python/ambari_agent/dummy_files/stomp/alert_definitions.json ---------------------------------------------------------------------- diff --git a/ambari-agent/src/test/python/ambari_agent/dummy_files/stomp/alert_definitions.json b/ambari-agent/src/test/python/ambari_agent/dummy_files/stomp/alert_definitions.json new file mode 100644 index 0000000..cc21244 --- /dev/null +++ b/ambari-agent/src/test/python/ambari_agent/dummy_files/stomp/alert_definitions.json @@ -0,0 +1,2700 @@ +{ + "hash": "37fe2bd73438980c619c2b8c2f95d160", + "clusters": { + "0": { + "hash": "8f7b4e960133bc691661cbcdaddddec8", + "clusterName": "cl1", + "hostName": "ctr-e134-1499953498516-81665-01-000008.hwx.site", + "publicHostName": "ctr-e134-1499953498516-81665-01-000008.hwx.site", + "alertDefinitions": [{ + "ignore_host": false, + "name": "hbase_master_process", + "componentName": "HBASE_MASTER", + "interval": 1, + "clusterId": 2, + "uuid": "ff73ead7-13b4-43ea-a747-d230f17bf230", + "label": "HBase Master Process", + "definitionId": 1, + "source": { + "reporting": { + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + }, + "type": "PORT", + "uri": "{{hbase-site/hbase.master.port}}", + "default_port": 60000 + }, + "serviceName": "HBASE", + "scope": "ANY", + "enabled": true, + "description": "This alert is triggered if the HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds." + }, + { + "ignore_host": false, + "name": "hbase_master_cpu", + "componentName": "HBASE_MASTER", + "interval": 5, + "clusterId": 2, + "uuid": "6c891177-b32f-47c8-befb-3846049f98e8", + "label": "HBase Master CPU Utilization", + "definitionId": 2, + "source": { + "jmx": { + "value": "{0} * 100", + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ] + }, + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200.0 + }, + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250.0 + } + }, + "type": "METRIC", + "uri": { + "connection_timeout": 5.0, + "default_port": 60010, + "http": "{{hbase-site/hbase.master.info.port}}", + "kerberos_principal": "{{hbase-site/hbase.security.authentication.spnego.kerberos.principal}}", + "kerberos_keytab": "{{hbase-site/hbase.security.authentication.spnego.kerberos.keytab}}" + } + }, + "serviceName": "HBASE", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if CPU utilization of the HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property. The threshold values are in percent." + }, + { + "ignore_host": false, + "name": "hbase_regionserver_process_percent", + "enabled": true, + "interval": 1, + "clusterId": 2, + "uuid": "69ff4c8f-8e98-4cfd-b90f-6914e2f147ff", + "label": "Percent RegionServers Available", + "definitionId": 3, + "source": { + "alert_name": "hbase_regionserver_process", + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 10.0 + }, + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 30.0 + } + }, + "type": "AGGREGATE" + }, + "serviceName": "HBASE", + "scope": "SERVICE", + "description": "This service-level alert is triggered if the configured percentage of RegionServer processes cannot be determined to be up and listening on the network for the configured warning and critical thresholds. It aggregates the results of RegionServer process down checks." + }, + { + "ignore_host": false, + "name": "yarn_nodemanager_webui_percent", + "enabled": true, + "interval": 1, + "clusterId": 2, + "uuid": "35ec3949-9cf6-4ef2-86f7-996e9bb15ced", + "label": "Percent NodeManagers Available", + "definitionId": 6, + "source": { + "alert_name": "yarn_nodemanager_webui", + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 10.0 + }, + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 30.0 + } + }, + "type": "AGGREGATE" + }, + "serviceName": "YARN", + "scope": "SERVICE", + "description": "This alert is triggered if the number of down NodeManagers in the cluster is greater than the configured critical threshold. It aggregates the results of NodeManager process checks." + }, + { + "ignore_host": false, + "name": "yarn_resourcemanager_webui", + "componentName": "RESOURCEMANAGER", + "interval": 1, + "clusterId": 2, + "uuid": "8313d813-4a75-45ec-ad01-c6c7841d9b2d", + "label": "ResourceManager Web UI", + "definitionId": 8, + "source": { + "reporting": { + "warning": { + "text": "HTTP {0} response from {1} in {2:.3f}s ({3})" + }, + "ok": { + "text": "HTTP {0} response in {2:.3f}s" + }, + "critical": { + "text": "Connection failed to {1} ({3})" + } + }, + "type": "WEB", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}", + "high_availability": { + "alias_key": "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}", + "https_pattern": "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}", + "http_pattern": "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}" + } + } + }, + "serviceName": "YARN", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the ResourceManager Web UI is unreachable." + }, + { + "ignore_host": false, + "name": "yarn_resourcemanager_cpu", + "componentName": "RESOURCEMANAGER", + "interval": 5, + "clusterId": 2, + "uuid": "99bf5ce1-ce97-48ed-803b-72d5f1bbe41b", + "label": "ResourceManager CPU Utilization", + "definitionId": 9, + "source": { + "jmx": { + "value": "{0} * 100", + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ] + }, + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200.0 + }, + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}", + "high_availability": { + "alias_key": "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}", + "https_pattern": "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}", + "http_pattern": "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}" + } + } + }, + "serviceName": "YARN", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if CPU utilization of the ResourceManager exceeds certain warning and critical thresholds. It checks the ResourceManager JMX Servlet for the SystemCPULoad property. The threshold values are in percent." + }, + { + "ignore_host": false, + "name": "yarn_resourcemanager_rpc_latency", + "componentName": "RESOURCEMANAGER", + "interval": 5, + "clusterId": 2, + "uuid": "3a0cb326-f8d3-42ec-b527-01a6c597b5aa", + "label": "ResourceManager RPC Latency", + "definitionId": 11, + "source": { + "jmx": { + "value": "{0}", + "property_list": [ + "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcQueueTimeAvgTime", + "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcProcessingTimeAvgTime" + ] + }, + "reporting": { + "units": "ms", + "warning": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 3000.0 + }, + "ok": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]" + }, + "critical": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 5000.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}", + "high_availability": { + "alias_key": "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}", + "https_pattern": "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}", + "http_pattern": "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}" + } + } + }, + "serviceName": "YARN", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the ResourceManager operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for ResourceManager operations. The threshold values are in milliseconds." + }, + { + "ignore_host": false, + "name": "nodemanager_health_summary", + "componentName": "RESOURCEMANAGER", + "interval": 1, + "clusterId": 2, + "uuid": "0315ba12-ada0-4004-a4b4-f174490e4b3c", + "label": "NodeManager Health Summary", + "definitionId": 12, + "source": { + "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Connection Timeout", + "name": "connection.timeout", + "value": 5.0, + "threshold": "CRITICAL", + "units": "seconds", + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL" + }] + }, + "serviceName": "YARN", + "scope": "SERVICE", + "enabled": true, + "description": "This service-level alert is triggered if there are unhealthy NodeManagers" + }, + { + "ignore_host": false, + "name": "namenode_cpu", + "componentName": "NAMENODE", + "interval": 5, + "clusterId": 2, + "uuid": "3b5e6dd2-115c-4340-8c0e-c33baeb4313b", + "label": "NameNode Host CPU Utilization", + "definitionId": 20, + "source": { + "jmx": { + "value": "{0} * 100", + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ] + }, + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200.0 + }, + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if CPU utilization of the NameNode exceeds certain warning and critical thresholds. It checks the NameNode JMX Servlet for the SystemCPULoad property. The threshold values are in percent." + }, + { + "ignore_host": false, + "name": "namenode_hdfs_pending_deletion_blocks", + "componentName": "NAMENODE", + "interval": 2, + "clusterId": 2, + "uuid": "416030cb-e996-4de9-b985-457f6bd5ac72", + "label": "HDFS Pending Deletion Blocks", + "definitionId": 22, + "source": { + "jmx": { + "value": "{0}", + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystem/PendingDeletionBlocks" + ] + }, + "reporting": { + "units": "Blocks", + "warning": { + "text": "Pending Deletion Blocks:[{0}]", + "value": 100000.0 + }, + "ok": { + "text": "Pending Deletion Blocks:[{0}]" + }, + "critical": { + "text": "Pending Deletion Blocks:[{0}]", + "value": 100000.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the number of blocks pending deletion in HDFS exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the PendingDeletionBlock property." + }, + { + "ignore_host": false, + "name": "namenode_client_rpc_queue_latency_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "d2a84ae6-53b1-4eb6-b573-f26220a1cc4f", + "label": "NameNode Client RPC Queue Latency (Daily)", + "definitionId": 23, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.client.RpcQueueTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": true, + "name": "namenode_ha_health", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "990b0d19-f7bf-45a3-ad45-d55fb4814bdd", + "label": "NameNode High Availability Health", + "definitionId": 24, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Connection Timeout", + "name": "connection.timeout", + "value": 5.0, + "threshold": "CRITICAL", + "units": "seconds", + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL" + }] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if either the Active NameNode or Standby NameNode are not running." + }, + { + "ignore_host": false, + "name": "datanode_health_summary", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "9550f814-70ce-4ad2-8dea-569fbc3e8636", + "label": "DataNode Health Summary", + "definitionId": 26, + "source": { + "jmx": { + "value": "{0} + {1}", + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystemState/NumDeadDataNodes", + "Hadoop:service=NameNode,name=FSNamesystemState/NumStaleDataNodes", + "Hadoop:service=NameNode,name=FSNamesystemState/NumLiveDataNodes" + ] + }, + "reporting": { + "units": "DNs", + "warning": { + "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]", + "value": 1.0 + }, + "ok": { + "text": "All {2} DataNode(s) are healthy" + }, + "critical": { + "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]", + "value": 1.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "SERVICE", + "enabled": true, + "description": "This service-level alert is triggered if there are unhealthy DataNodes" + }, + { + "ignore_host": false, + "name": "namenode_service_rpc_queue_latency_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "54f022db-51f6-4193-8aa7-52c22a0f4194", + "label": "NameNode Service RPC Queue Latency (Daily)", + "definitionId": 28, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "MB", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": false, + "name": "namenode_client_rpc_processing_latency_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "19498de1-618b-4097-b916-cc65d6b2b2ca", + "label": "NameNode Client RPC Processing Latency (Daily)", + "definitionId": 30, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.client.RpcProcessingTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": false, + "name": "namenode_hdfs_blocks_health", + "componentName": "NAMENODE", + "interval": 2, + "clusterId": 2, + "uuid": "47817ad5-f654-46a6-9f72-482e527394a9", + "label": "NameNode Blocks Health", + "definitionId": 31, + "source": { + "jmx": { + "value": "{0}", + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks", + "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal" + ] + }, + "reporting": { + "units": "Blocks", + "warning": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]", + "value": 1.0 + }, + "ok": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]" + }, + "critical": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]", + "value": 1.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the number of corrupt or missing blocks exceeds the configured critical threshold. The threshold values are in blocks." + }, + { + "ignore_host": false, + "name": "namenode_webui", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "edae319f-c785-4386-ad8b-df12a3ed4854", + "label": "NameNode Web UI", + "definitionId": 32, + "source": { + "reporting": { + "warning": { + "text": "HTTP {0} response from {1} in {2:.3f}s ({3})" + }, + "ok": { + "text": "HTTP {0} response in {2:.3f}s" + }, + "critical": { + "text": "Connection failed to {1} ({3})" + } + }, + "type": "WEB", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the NameNode Web UI is unreachable." + }, + { + "ignore_host": false, + "name": "namenode_service_rpc_processing_latency_hourly", + "componentName": "NAMENODE", + "interval": 5, + "clusterId": 2, + "uuid": "e0826fd1-091e-463c-b3b2-38ded95acef7", + "label": "NameNode Service RPC Processing Latency (Hourly)", + "definitionId": 35, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 60.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within an hour period." + }, + { + "ignore_host": false, + "name": "nfsgateway_process", + "componentName": "NFS_GATEWAY", + "interval": 1, + "clusterId": 2, + "uuid": "2b1e103d-5dd2-45b5-86ec-3ea6a0fb6de3", + "label": "NFS Gateway Process", + "definitionId": 36, + "source": { + "reporting": { + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + }, + "type": "PORT", + "uri": "{{hdfs-site/nfs.server.port}}", + "default_port": 2049 + }, + "serviceName": "HDFS", + "scope": "HOST", + "enabled": true, + "description": "This host-level alert is triggered if the NFS Gateway process cannot be confirmed to be up and listening on the network." + }, + { + "ignore_host": false, + "name": "journalnode_process_percent", + "enabled": true, + "interval": 1, + "clusterId": 2, + "uuid": "abff90f2-5feb-4d19-a89e-68a9e8a0a300", + "label": "Percent JournalNodes Available", + "definitionId": 37, + "source": { + "alert_name": "journalnode_process", + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 33.0 + }, + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 50.0 + } + }, + "type": "AGGREGATE" + }, + "serviceName": "HDFS", + "scope": "SERVICE", + "description": "This alert is triggered if the number of down JournalNodes in the cluster is greater than the configured critical threshold. It aggregates the results of JournalNode process checks." + }, + { + "ignore_host": false, + "name": "namenode_increase_in_storage_capacity_usage_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "988253fe-3834-4c95-91f3-24d1e62fb1ac", + "label": "HDFS Storage Capacity Usage (Daily)", + "definitionId": 38, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "dfs.FSNamesystem.CapacityUsed", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 30.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of storage capacity usage growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 50.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of storage capacity usage growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "B", + "type": "STRING", + "description": "The units that the metric data points are reported in." + }, + { + "display_name": "Minimum Capacity", + "name": "minimumValue", + "value": 100.0, + "units": "MB", + "type": "NUMERIC", + "description": "The minimum capacity increase in a day." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": false, + "name": "namenode_client_rpc_queue_latency_hourly", + "componentName": "NAMENODE", + "interval": 5, + "clusterId": 2, + "uuid": "02110587-22f1-42ed-8411-8b488dca7342", + "label": "NameNode Client RPC Queue Latency (Hourly)", + "definitionId": 39, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 60.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.client.RpcQueueTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC queue latency on client port has grown beyond the specified threshold within an hour period." + }, + { + "ignore_host": false, + "name": "datanode_storage_percent", + "enabled": true, + "interval": 1, + "clusterId": 2, + "uuid": "862dd1c8-0d15-435b-8adc-96e113bd8477", + "label": "Percent DataNodes With Available Space", + "definitionId": 40, + "source": { + "alert_name": "datanode_storage", + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 10.0 + }, + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 30.0 + } + }, + "type": "AGGREGATE" + }, + "serviceName": "HDFS", + "scope": "SERVICE", + "description": "This service-level alert is triggered if the storage on a certain percentage of DataNodes exceeds either the warning or critical threshold values." + }, + { + "ignore_host": false, + "name": "namenode_service_rpc_processing_latency_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "c2919f99-c413-4d70-b58c-29dbce9f50c7", + "label": "NameNode Service RPC Processing Latency (Daily)", + "definitionId": 41, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": false, + "name": "upgrade_finalized_state", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "efd896ec-4a61-4622-bac7-a2008cb9b42a", + "label": "HDFS Upgrade Finalized State", + "definitionId": 42, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py", + "type": "SCRIPT", + "parameters": [ + + ] + }, + "serviceName": "HDFS", + "scope": "SERVICE", + "enabled": true, + "description": "This service-level alert is triggered if HDFS is not in the finalized state" + }, + { + "ignore_host": false, + "name": "namenode_client_rpc_processing_latency_hourly", + "componentName": "NAMENODE", + "interval": 5, + "clusterId": 2, + "uuid": "2b939d84-fc92-4b6b-88a7-b5c5d0151040", + "label": "NameNode Client RPC Processing Latency (Hourly)", + "definitionId": 43, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 60.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.client.RpcProcessingTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC processing latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within an hour period." + }, + { + "ignore_host": false, + "name": "namenode_increase_in_storage_capacity_usage_weekly", + "componentName": "NAMENODE", + "interval": 1440, + "clusterId": 2, + "uuid": "aa5c734e-791f-40cc-8f94-2d4a2e4dd7ae", + "label": "HDFS Storage Capacity Usage (Weekly)", + "definitionId": 44, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 10080.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "dfs.FSNamesystem.CapacityUsed", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 10.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of storage capacity usage growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 20.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of storage capacity usage growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "B", + "type": "STRING", + "description": "The units that the metric data points are reported in." + }, + { + "display_name": "Minimum Capacity", + "name": "minimumValue", + "value": 1000.0, + "units": "MB", + "type": "NUMERIC", + "description": "The minimum capacity increase in a week." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a week period." + }, + { + "ignore_host": false, + "name": "journalnode_process", + "componentName": "JOURNALNODE", + "interval": 1, + "clusterId": 2, + "uuid": "709a69ae-ef63-4ec4-ba68-3de27f6a25bb", + "label": "JournalNode Web UI", + "definitionId": 45, + "source": { + "reporting": { + "warning": { + "text": "HTTP {0} response from {1} in {2:.3f}s ({3})" + }, + "ok": { + "text": "HTTP {0} response in {2:.3f}s" + }, + "critical": { + "text": "Connection failed to {1} ({3})" + } + }, + "type": "WEB", + "uri": { + "http": "{{hdfs-site/dfs.journalnode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.journalnode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}" + } + }, + "serviceName": "HDFS", + "scope": "HOST", + "enabled": true, + "description": "This host-level alert is triggered if the JournalNode Web UI is unreachable." + }, + { + "ignore_host": false, + "name": "hdfs_zookeeper_failover_controller_process", + "componentName": "ZKFC", + "interval": 1, + "clusterId": 2, + "uuid": "31432389-cfe0-4b28-8660-7b69243698e8", + "label": "ZooKeeper Failover Controller Process", + "definitionId": 46, + "source": { + "reporting": { + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + }, + "type": "PORT", + "uri": "{{hdfs-site/dfs.ha.zkfc.port}}", + "default_port": 8019 + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the ZooKeeper Failover Controller process cannot be confirmed to be up and listening on the network." + }, + { + "ignore_host": false, + "name": "namenode_directory_status", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "a4142c8f-5f2d-4f75-be0d-8dbbc9b312d8", + "label": "NameNode Directory Status", + "definitionId": 47, + "source": { + "jmx": { + "value": "calculate(args)\ndef calculate(args):\n import json\n json_statuses = json.loads({0})\n return len(json_statuses['failed']) if 'failed' in json_statuses else 0", + "property_list": [ + "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses" + ] + }, + "reporting": { + "units": "Dirs", + "warning": { + "text": "Failed directory count: {1}", + "value": 1.0 + }, + "ok": { + "text": "Directories are healthy" + }, + "critical": { + "text": "Failed directory count: {1}", + "value": 1.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the NameNode NameDirStatuses metric (name=NameNodeInfo/NameDirStatuses) reports a failed directory. The threshold values are in the number of directories that are not healthy." + }, + { + "ignore_host": false, + "name": "namenode_service_rpc_queue_latency_hourly", + "componentName": "NAMENODE", + "interval": 5, + "clusterId": 2, + "uuid": "8aceab50-916f-4642-9bb7-811cbccb5c46", + "label": "NameNode Service RPC Queue Latency (Hourly)", + "definitionId": 48, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 60.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 100.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of RPC queue latency growth." + }, + { + "display_name": "Minimum Latency", + "name": "minimumValue", + "value": 30.0, + "units": "seconds", + "type": "NUMERIC", + "description": "The minimum latency to measure growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "ms", + "type": "STRING", + "description": "The units that the metric data points are reported in." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the deviation of RPC queue latency on datanode port has grown beyond the specified threshold within an hour period." + }, + { + "ignore_host": false, + "name": "increase_nn_heap_usage_weekly", + "componentName": "NAMENODE", + "interval": 1440, + "clusterId": 2, + "uuid": "8ca3a9de-1d43-40e3-bdad-d28cb59921a6", + "label": "NameNode Heap Usage (Weekly)", + "definitionId": 49, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 10080.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "jvm.JvmMetrics.MemHeapUsedM", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 20.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of NameNode heap usage growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 50.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of NameNode heap usage growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "MB", + "type": "STRING", + "description": "The units that the metric data points are reported in." + }, + { + "display_name": "Minimum Heap", + "name": "minimumValue", + "value": 1000.0, + "units": "MB", + "type": "NUMERIC", + "description": "The minimum heap increase in a week." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the NameNode heap usage deviation has grown beyond the specified threshold within a week period." + }, + { + "ignore_host": false, + "name": "datanode_process_percent", + "enabled": true, + "interval": 1, + "clusterId": 2, + "uuid": "981b9a0e-09d8-4e50-98a6-307dde57555b", + "label": "Percent DataNodes Available", + "definitionId": 50, + "source": { + "alert_name": "datanode_process", + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 10.0 + }, + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 30.0 + } + }, + "type": "AGGREGATE" + }, + "serviceName": "HDFS", + "scope": "SERVICE", + "description": "This alert is triggered if the number of down DataNodes in the cluster is greater than the configured critical threshold. It aggregates the results of DataNode process checks." + }, + { + "ignore_host": false, + "name": "namenode_hdfs_capacity_utilization", + "componentName": "NAMENODE", + "interval": 2, + "clusterId": 2, + "uuid": "ce199698-13f1-408f-9615-b00e24533c0d", + "label": "HDFS Capacity Utilization", + "definitionId": 51, + "source": { + "jmx": { + "value": "{0}/({0} + {1}) * 100.0", + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed", + "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining" + ] + }, + "reporting": { + "units": "%", + "type": "PERCENT", + "warning": { + "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]", + "value": 75.0 + }, + "ok": { + "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]" + }, + "critical": { + "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]", + "value": 80.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the HDFS capacity utilization exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the CapacityUsed and CapacityRemaining properties. The threshold values are in percent." + }, + { + "ignore_host": false, + "name": "namenode_rpc_latency", + "componentName": "NAMENODE", + "interval": 2, + "clusterId": 2, + "uuid": "5093fa60-77d4-461a-bb8f-c2c01da3a2ce", + "label": "NameNode RPC Latency", + "definitionId": 52, + "source": { + "jmx": { + "value": "{0}", + "property_list": [ + "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime", + "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime" + ] + }, + "reporting": { + "units": "ms", + "warning": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 3000.0 + }, + "ok": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]" + }, + "critical": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 5000.0 + } + }, + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https_property_value": "HTTPS_ONLY", + "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "default_port": 0, + "connection_timeout": 5.0, + "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}", + "high_availability": { + "nameservice": "{{hdfs-site/dfs.internal.nameservices}}", + "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}", + "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}", + "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}" + } + } + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This host-level alert is triggered if the NameNode RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for NameNode operations. The threshold values are in milliseconds." + }, + { + "ignore_host": false, + "name": "namenode_last_checkpoint", + "componentName": "NAMENODE", + "interval": 1, + "clusterId": 2, + "uuid": "6d4c3c01-4971-4077-9dd2-0e4cf8f54573", + "label": "NameNode Last Checkpoint", + "definitionId": 53, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Connection Timeout", + "name": "connection.timeout", + "value": 5.0, + "threshold": "CRITICAL", + "units": "seconds", + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL" + }, + { + "display_name": "Checkpoint Warning", + "name": "checkpoint.time.warning.threshold", + "value": 200.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a warning alert." + }, + { + "display_name": "Checkpoint Critical", + "name": "checkpoint.time.critical.threshold", + "value": 200.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a critical alert." + }, + { + "display_name": "Uncommitted transactions Warning", + "name": "checkpoint.txns.multiplier.warning.threshold", + "value": 2.0, + "threshold": "WARNING", + "type": "NUMERIC", + "description": "The multiplier to use against dfs.namenode.checkpoint.period compared to the difference between last transaction id and most recent transaction id beyond which to trigger a warning alert." + }, + { + "display_name": "Uncommitted transactions Critical", + "name": "checkpoint.txns.multiplier.critical.threshold", + "value": 4.0, + "threshold": "CRITICAL", + "type": "NUMERIC", + "description": "The multiplier to use against dfs.namenode.checkpoint.period compared to the difference between last transaction id and most recent transaction id beyond which to trigger a critical alert." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert will trigger if the last time that the NameNode performed a checkpoint was too long ago. It will also trigger if the number of uncommitted transactions is beyond a certain threshold." + }, + { + "ignore_host": false, + "name": "increase_nn_heap_usage_daily", + "componentName": "NAMENODE", + "interval": 480, + "clusterId": 2, + "uuid": "dc701c46-b8a9-42dd-938b-bde3ee3ec20c", + "label": "NameNode Heap Usage (Daily)", + "definitionId": 54, + "source": { + "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py", + "type": "SCRIPT", + "parameters": [{ + "display_name": "Whether active and stanby NameNodes metrics should be merged", + "name": "mergeHaMetrics", + "visibility": "HIDDEN", + "value": "false", + "type": "STRING", + "description": "Whether active and stanby NameNodes metrics should be merged." + }, + { + "display_name": "Time interval in minutes", + "name": "interval", + "visibility": "HIDDEN", + "value": 1440.0, + "type": "NUMERIC", + "description": "Time interval in minutes." + }, + { + "display_name": "AMS application id", + "name": "appId", + "visibility": "HIDDEN", + "value": "NAMENODE", + "type": "STRING", + "description": "The application id used to retrieve the metric." + }, + { + "display_name": "Metric Name", + "name": "metricName", + "visibility": "HIDDEN", + "value": "jvm.JvmMetrics.MemHeapUsedM", + "type": "STRING", + "description": "The metric to monitor." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.warning.threshold", + "value": 20.0, + "threshold": "WARNING", + "units": "%", + "type": "PERCENT", + "description": "The percentage of NameNode heap usage growth." + }, + { + "display_name": "Growth Rate", + "name": "metric.deviation.critical.threshold", + "value": 50.0, + "threshold": "CRITICAL", + "units": "%", + "type": "PERCENT", + "description": "The percentage of NameNode heap usage growth." + }, + { + "display_name": "Metric Units", + "name": "metric.units", + "visibility": "HIDDEN", + "value": "MB", + "type": "STRING", + "description": "The units that the metric data points are reported in." + }, + { + "display_name": "Minimum Heap", + "name": "minimumValue", + "value": 100.0, + "units": "MB", + "type": "NUMERIC", + "description": "The minimum heap increase in a day." + } + ] + }, + "serviceName": "HDFS", + "scope": "ANY", + "enabled": true, + "description": "This service-level alert is triggered if the NameNode heap usage deviation has grown beyond the specified threshold within a day period." + }, + { + "ignore_host": false, + "name": "kafka_broker_process", + "componentName": "KAFKA_BROKER", + "interval": 1, + "clusterId": 2, + "uuid": "a240cb65-de3d-421c-b242-f0022939d41d", + "label": "Kafka Broker Process", + "definitionId": 55, + "source": { + "reporting": { + "warning": { + "text": "TCP OK - {0:.3f}s response on port {1}", + "value": 1.5 + }, + "ok": { + "text": "TCP OK - {0:.3f}s response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}", + "value": 5.0 + } + }, + "type": "PORT", + "uri": "{{kafka-broker/listeners}}", + "default_port": 6667 + }, + "serviceName": "KAFKA", + "scope": "HOST", + "enabled": true, + "description": "This host-level alert is triggered if the Kafka Broker cannot be determined to be up." + }, + { + "ignore_host": false, + "name": "ams_metrics_monitor_process", + "componentName": "METRICS_MONITOR", + "interval": 1, + "clusterId": 2, + "uuid": "4e90992b-420d-43ce-9f58-29a420b6d45f", + "label": "Metrics Monitor Status", + "definitionId": 56, +