incubator-ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From swa...@apache.org
Subject svn commit: r1482589 - in /incubator/ambari/trunk: ./ ambari-agent/src/main/puppet/modules/hdp-nagios/files/ ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/ ambari-agent/src/main/puppet/modules/hdp-nagios/templates/ ambari-agent/src/main/pup...
Date Tue, 14 May 2013 20:56:42 GMT
Author: swagle
Date: Tue May 14 20:56:41 2013
New Revision: 1482589

URL: http://svn.apache.org/r1482589
Log:
AMBARI-2133. Add Nagios alerts for Hadoop 2.0 in Ambari. (swagle)

Modified:
    incubator/ambari/trunk/CHANGES.txt
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
    incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
    incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml

Modified: incubator/ambari/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/CHANGES.txt?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/CHANGES.txt (original)
+++ incubator/ambari/trunk/CHANGES.txt Tue May 14 20:56:41 2013
@@ -12,6 +12,8 @@ Trunk (unreleased changes):
 
  NEW FEATURES
 
+ AMBARI-2133. Add Nagios alerts for Hadoop 2.0 in Ambari. (swagle)
+
  AMBARI-2123. Allow the user to specify a non-root ssh user in Install Options.
  (yusaku)
 

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php Tue May 14 20:56:41 2013
@@ -21,7 +21,7 @@
  * It checks the rpc wait time in the queue, RpcQueueTime_avg_time
  * check_rpcq_latency -h hostaddress -p port -t ServiceName -w 1 -c 1
  * Warning and Critical values are in seconds
- * Service Name = JobTracker, NameNode
+ * Service Name = JobTracker, NameNode, JobHistoryServer
  */
 
   $options = getopt ("h:p:w:c:n:");
@@ -62,6 +62,6 @@
 
   /* print usage */
   function usage () {
-    echo "Usage: $0 -h <host> -p port -n <JobTracker/NameNode> -w <warn_in_sec> -c <crit_in_sec>\n";
+    echo "Usage: $0 -h <host> -p port -n <JobTracker/NameNode/JobHistoryServer> -w <warn_in_sec> -c <crit_in_sec>\n";
   }
 ?>

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh Tue May 14 20:56:41 2013
@@ -64,7 +64,21 @@ hbase)
       exit 1;
     fi
     ;;
-*) echo "UNKNOWN: Invalid service name [$service], valid options [jobtracker|jobhistory|hbase|namenode]"
+resorcemanager)
+    rmweburl="http://$host:$port/cluster"
+    if [[ `checkurl "$rmweburl"` -ne 0 ]]; then 
+      echo "WARNING: ResourceManager web UI not accessible : $rmweburl";
+      exit 1;
+    fi
+    ;;
+historyserver2)
+    hsweburl="http://$host:$port/jobhistory"
+    if [[ `checkurl "$hsweburl"` -ne 0 ]]; then 
+      echo "WARNING: HistoryServer2 web UI not accessible : $hsweburl";
+      exit 1;
+    fi
+    ;;
+*) echo "UNKNOWN: Invalid service name [$service], valid options [jobtracker|jobhistory|hbase|namenode|resorcemanager|historyserver2]"
    exit 3
    ;;
 esac

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp Tue May 14 20:56:41 2013
@@ -86,6 +86,9 @@ class hdp-nagios::params() inherits hdp:
     region-servers => {host_member_info => 'hbase_rs_hosts'},
     oozie-server => {host_member_info => 'oozie_server'},
     webhcat-server => {host_member_info => 'webhcat_server_host'},
-    hue-server => {host_member_info => 'hue_server_host'}
+    hue-server => {host_member_info => 'hue_server_host'},
+    resorcemanager => {host_member_info => 'rm_host'},
+    nodemanagers => {host_member_info => 'nm_hosts'},
+    historyserver2 => {host_member_info => 'hs_host'}
   }
 }

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb Tue May 14 20:56:41 2013
@@ -7,6 +7,11 @@ define servicegroup {
   alias  MAPREDUCE Checks
 }
 define servicegroup {
+  servicegroup_name  YARN
+  alias  YARN Checks
+}
+
+define servicegroup {
   servicegroup_name  HBASE
   alias  HBASE Checks
 }

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb Tue May 14 20:56:41 2013
@@ -175,6 +175,46 @@ define service {
         max_check_attempts      4
 }
 <%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
+define service {
+        hostgroup_name          resorcemanager
+        use                     hadoop-service
+        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for Resource Manager
+        servicegroups           GANGLIA
+        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_rm_port")%>!-w 1 -c 1
+        normal_check_interval   0.25
+        retry_check_interval    0.25
+        max_check_attempts      4
+}
+<%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
+define service {
+        hostgroup_name          nodemanagers
+        use                     hadoop-service
+        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for Node Manager
+        servicegroups           GANGLIA
+        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_nm_port")%>!-w 1 -c 1
+        normal_check_interval   0.25
+        retry_check_interval    0.25
+        max_check_attempts      4
+}
+<%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
+define service {
+        hostgroup_name          historyserver2
+        use                     hadoop-service
+        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for History Server 2
+        servicegroups           GANGLIA
+        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_hs_port")%>!-w 1 -c 1
+        normal_check_interval   0.25
+        retry_check_interval    0.25
+        max_check_attempts      4
+}
+<%end-%>
+
 <%end-%>
 
 <%if scope.function_hdp_nagios_members_exist('snamenode')-%>
@@ -344,6 +384,94 @@ define service {
 
 <%end-%>
 
+<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
+# YARN::RESOURCEMANAGER Checks 
+define service {
+        hostgroup_name          resorcemanager
+        use                     hadoop-service
+        service_description     RESOURCEMANAGER::Resource Manager Web UI down
+        servicegroups           YARN
+        check_command           check_webui!resorcemanager!<%=scope.function_hdp_template_var("rm_port")%>
+        normal_check_interval   1
+        retry_check_interval    1
+        max_check_attempts      3
+}
+
+define service {
+        hostgroup_name          resorcemanager
+        use                     hadoop-service
+        service_description     RESOURCEMANAGER::Resource Manager CPU utilization
+        servicegroups           YARN
+        check_command           check_cpu!200%!250%
+        normal_check_interval   5
+        retry_check_interval    2 
+        max_check_attempts      5
+}
+
+define service {
+        hostgroup_name          resorcemanager
+        use                     hadoop-service
+        service_description     RESOURCEMANAGER::Resource Manager RPC latency
+        servicegroups           YARN
+        check_command           check_rpcq_latency!ResorceManager!<%=scope.function_hdp_template_var("rm_port")%>!3000!5000
+        normal_check_interval   5
+        retry_check_interval    1 
+        max_check_attempts      5
+}
+
+<% end %>
+
+<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
+# YARN::NODEMANAGER Checks
+define service {
+        hostgroup_name          nodemanagers
+        use                     hadoop-service
+        service_description     NODEMANAGER::Node Manager process down
+        servicegroups           YARN
+        check_command           check_tcp!<%=scope.function_hdp_template_var("nm_port")%>!-w 1 -c 1
+        normal_check_interval   1
+        retry_check_interval    0.5
+        max_check_attempts      3
+}
+<% end %>
+
+<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
+# MAPREDUCE::HISTORYSERVER2 Checks
+define service {
+        hostgroup_name          historyserver2
+        use                     hadoop-service
+        service_description     HISTORYSERVER2::History Server 2 Web UI down
+        servicegroups           MAPREDUCE
+        check_command           check_webui!historyserver2!<%=scope.function_hdp_template_var("hs_port")%>
+        normal_check_interval   1
+        retry_check_interval    1
+        max_check_attempts      3
+}
+
+define service {
+        hostgroup_name          historyserver2
+        use                     hadoop-service
+        service_description     HISTORYSERVER::History Server 2 CPU utilization
+        servicegroups           MAPREDUCE
+        check_command           check_cpu!200%!250%
+        normal_check_interval   5
+        retry_check_interval    2 
+        max_check_attempts      5
+}
+
+define service {
+        hostgroup_name          historyserver2
+        use                     hadoop-service
+        service_description     HISTORYSERVER::History Server 2 RPC latency
+        servicegroups           MAPREDUCE
+        check_command           check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("hs_port")%>!3000!5000
+        normal_check_interval   5
+        retry_check_interval    1 
+        max_check_attempts      5
+}
+
+<% end %>
+
 <%if scope.function_hdp_nagios_members_exist('slaves')-%>
 # HDFS::DATANODE Checks
 define service {

Modified: incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp (original)
+++ incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp Tue May 14 20:56:41 2013
@@ -50,6 +50,14 @@ class hdp(
     $jtnode_port = hdp_get_port_from_url($mapred-site["mapred.job.tracker.http.address"],"50030")
     $tasktracker_port = hdp_get_port_from_url($mapred-site["mapred.task.tracker.http.address"],"50060")
     $jobhistory_port = hdp_get_port_from_url($mapred-site["mapreduce.history.server.http.address"],"51111")
+
+    $hs_port = hdp_get_port_from_url($mapred-site["mapreduce.jobhistory.webapp.address"],"19888")
+  }
+
+  if has_key($configuration, 'yarn-site') {
+    $yarn-site = $configuration['yarn-site']
+    $rm_port = hdp_get_port_from_url($yarn-site["yarn.resourcemanager.webapp.address"],"8088")
+    $nm_port = hdp_get_port_from_url($yarn-site["yarn.nodemanager.webapp.address"],"8042")
   }
 
   $hbase_master_port = hdp_default("hbase-site/hbase.master.info.port","60010")
@@ -60,6 +68,9 @@ class hdp(
   $ganglia_collector_namenode_port = hdp_default("ganglia_collector_namenode_port","8661")
   $ganglia_collector_jobtracker_port = hdp_default("ganglia_collector_jobtracker_port","8662")
   $ganglia_collector_hbase_port = hdp_default("ganglia_collector_hbase_port","8663")
+  $ganglia_collector_rm_port = hdp_default("ganglia_collector_rm_port","8664")
+  $ganglia_collector_nm_port = hdp_default("ganglia_collector_nm_port","8665")
+  $ganglia_collector_hs_port = hdp_default("ganglia_collector_hs_port","8666")
 
   $oozie_server_port = hdp_default("oozie_server_port","11000")
 

Modified: incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml
URL: http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml (original)
+++ incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml Tue May 14 20:56:41 2013
@@ -31,10 +31,6 @@
             <category>SLAVE</category>
         </component>
 
-        <component>
-            <name>MONITOR_WEBSERVER</name>
-            <category>MASTER</category>
-        </component>
     </components>
 
 </metainfo>



Mime
View raw message