chukwa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ey...@apache.org
Subject svn commit: r767803 - in /hadoop/chukwa: branches/chukwa-0.1/CHANGES.txt branches/chukwa-0.1/conf/aggregator.sql trunk/CHANGES.txt trunk/conf/aggregator.sql
Date Thu, 23 Apr 2009 05:30:02 GMT
Author: eyang
Date: Thu Apr 23 05:30:02 2009
New Revision: 767803

URL: http://svn.apache.org/viewvc?rev=767803&view=rev
Log:
CHUKWA-178. Extend aggregation by 5 minutes for cluster metrics. (Eric Yang)

Modified:
    hadoop/chukwa/branches/chukwa-0.1/CHANGES.txt
    hadoop/chukwa/branches/chukwa-0.1/conf/aggregator.sql
    hadoop/chukwa/trunk/CHANGES.txt
    hadoop/chukwa/trunk/conf/aggregator.sql

Modified: hadoop/chukwa/branches/chukwa-0.1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/chukwa/branches/chukwa-0.1/CHANGES.txt?rev=767803&r1=767802&r2=767803&view=diff
==============================================================================
--- hadoop/chukwa/branches/chukwa-0.1/CHANGES.txt (original)
+++ hadoop/chukwa/branches/chukwa-0.1/CHANGES.txt Thu Apr 23 05:30:02 2009
@@ -36,6 +36,8 @@
 
   IMPROVEMENTS
 
+    CHUKWA-178. Extend aggregation by 5 minutes for cluster metrics. (Eric Yang)
+
     CHUKWA-176. Rearrange the parameter order for aggregator.sh. (Eric Yang)
 
     CHUKWA-174. Added test cases to test database partitioning, database aggregation, and
data loading. (Eric Yang)
@@ -107,6 +109,8 @@
 
   BUG FIXES
 
+    CHUKWA-158. Consolidate namenode address to a single configuration object. (Eric Yang)
+
     CHUKWA-168. Added watchdog for database. (Eric Yang)
 
     CHUKWA-175.  Removed error message for shutting down data processors. (Eric Yang)

Modified: hadoop/chukwa/branches/chukwa-0.1/conf/aggregator.sql
URL: http://svn.apache.org/viewvc/hadoop/chukwa/branches/chukwa-0.1/conf/aggregator.sql?rev=767803&r1=767802&r2=767803&view=diff
==============================================================================
--- hadoop/chukwa/branches/chukwa-0.1/conf/aggregator.sql (original)
+++ hadoop/chukwa/branches/chukwa-0.1/conf/aggregator.sql Thu Apr 23 05:30:02 2009
@@ -3,14 +3,14 @@
 #insert into [a] select d.Timestamp as starttime,((AvgCPUBusy * j.NumOfMachines) / (sum(j.NumOfMachines)
* 1)) as used from Digest d join HodJob j on (d.HodID = j.HodID) where d.Timestamp >= '[past_10_minutes]'
and d.Timestamp <= '[now]' group by d.Timestamp order by d.Timestamp 
 #insert into [b] select m, sum(foo.nodehours) as nodehours from (select m.MRJobID, round(avg(if(AvgCPUBusy
is null,0,AvgCPUBusy)),0) as m, count(*)*j.NumOfMachines/60 as nodehours from HodJobDigest
d join HodJob j on (d.HodID = j.HodID) join MRJob m on (m.HodID = j.HodID) where d.Timestamp
>= '[past_10_minutes]' and d.Timestamp <= '[now]' and d.Timestamp >= m.LAUNCH_TIME
and d.Timestamp <= m.FINISH_TIME group by m.MRJobID) as foo group by m; 
 #insert into [c] select if(AvgCPUBusy is null,0,AvgCPUBusy) as m, CASE WHEN MRJobName like
'PigLatin%' THEN 'Pig' WHEN MRJobName like 'streamjob%' THEN 'Streaming' WHEN MRJobName like
'%abacus%' THEN 'Abacus' ELSE 'Other' END as interface, count(*)*j.NumOfMachines/60 as nodehours,count(distinct(MRJobID))
as jobs from HodJobDigest d join HodJob j on (d.HodID = j.HodID) join MRJob m on (m.HodID
= j.HodID) where d.Timestamp >= '[past_10_minutes]' and d.Timestamp <= '[now]' and d.Timestamp
>= m.LAUNCH_TIME and d.Timestamp <= m.FINISH_TIME group by AvgCPUBusy,CASE WHEN MRJobName
like 'PigLatin%' THEN 'Pig' WHEN MRJobName like 'streamjob%' THEN 'Streaming' WHEN MRJobName
like '%abacus%' THEN 'Abacus' ELSE 'Other' END order by if(AvgCPUBusy is null,0,AvgCPUBusy)
-#insert into [cluster_hadoop_mapred] (select timestamp,[avg(hadoop_mapred_job)] from [hadoop_mapred_job]
where timestamp between '[past_10_minutes]' and '[now]' group by timestamp);
-replace into [cluster_system_metrics] (select timestamp,[avg(system_metrics)] from [system_metrics]
where timestamp between '[past_10_minutes]' and '[past_5_minutes]' group by timestamp);
+#insert into [cluster_hadoop_mapred] (select timestamp,[avg(hadoop_mapred_job)] from [hadoop_mapred_job]
where timestamp between '[past_15_minutes]' and '[now]' group by timestamp);
+replace into [cluster_system_metrics] (select timestamp,[avg(system_metrics)] from [system_metrics]
where timestamp between '[past_15_minutes]' and '[past_5_minutes]' group by timestamp);
 set @ph='',@p_block_reports_num_ops:=0,@p_block_verification_failures:=0,@p_blocks_read:=0,@p_blocks_removed:=0,@p_blocks_replicated:=0,@p_blocks_verified:=0,@p_blocks_written:=0,@p_bytes_read:=0,@p_bytes_written:=0,@p_copy_block_op_num_ops:=0,@p_heart_beats_num_ops:=0,@p_read_block_op_num_ops:=0,@p_read_metadata_op_num_ops:=0,@p_reads_from_local_client:=0,@p_reads_from_remote_client:=0,@p_replace_block_op_num_ops:=0,@p_write_block_op_num_ops:=0,@p_writes_from_local_client:=0,@p_writes_from_remote_client:=0;
 replace into [dfs_throughput] (select timestamp,count(host),avg(block_reports_avg_time),sum(block_reports_num_ops),sum(block_verification_failures),sum(blocks_read),sum(blocks_removed),sum(blocks_replicated),sum(blocks_verified),sum(blocks_written),sum(bytes_read),sum(bytes_written),avg(copy_block_op_avg_time),sum(copy_block_op_num_ops),avg(heart_beats_avg_time),sum(heart_beats_num_ops),avg(read_block_op_avg_time),sum(read_block_op_num_ops),avg(read_metadata_op_avg_time),sum(read_metadata_op_num_ops),sum(reads_from_local_client),sum(reads_from_remote_client),avg(replace_block_op_avg_time),sum(replace_block_op_num_ops),count(session_id),avg(write_block_op_avg_time),sum(write_block_op_num_ops),sum(writes_from_local_client),sum(writes_from_remote_client)
from (select timestamp,host,block_reports_avg_time,case host when @ph then block_reports_num_ops-@p_block_reports_num_ops
else 0 end as block_reports_num_ops, case host when @ph then block_verification_failures-@p_block_verific
 ation_failures else 0 end as block_verification_failures,case host when @ph then blocks_read-@p_blocks_read
else 0 end as blocks_read,case host when @ph then blocks_removed-@p_blocks_removed else 0
end as blocks_removed,case host when @ph then blocks_replicated-@p_blocks_replicated else
0 end as blocks_replicated,case host when @ph then blocks_verified-@p_blocks_verified else
0 end as blocks_verified,case host when @ph then blocks_written-@p_blocks_written else 0 end
as blocks_written,case host when @ph then bytes_read-@p_bytes_read else 0 end as bytes_read,case
host when @ph then bytes_written-@p_bytes_written else 0 end as bytes_written,copy_block_op_avg_time,case
host when @ph then copy_block_op_num_ops-@p_copy_block_op_num_ops else 0 end as copy_block_op_num_ops,heart_beats_avg_time,case
host when @ph then heart_beats_num_ops-@p_heart_beats_num_ops else 0 end as heart_beats_num_ops,read_block_op_avg_time,case
host when @ph then read_block_op_num_ops-@p_read_block_op_num_
 ops else 0 end as read_block_op_num_ops,read_metadata_op_avg_time,case host when @ph then
read_metadata_op_num_ops-@p_read_metadata_op_num_ops else 0 end as read_metadata_op_num_ops,case
host when @ph then reads_from_local_client-@p_reads_from_local_client else 0 end as reads_from_local_client,case
host when @ph then reads_from_remote_client-@p_reads_from_remote_client else 0 end as reads_from_remote_client,replace_block_op_avg_time,case
host when @ph then replace_block_op_num_ops-@p_replace_block_op_num_ops else 0 end as replace_block_op_num_ops,session_id,write_block_op_avg_time,case
host when @ph then write_block_op_num_ops-@p_write_block_op_num_ops else 0 end as write_block_op_num_ops,case
host when @ph then writes_from_local_client-@p_writes_from_local_client else 0 end as writes_from_local_client,case
host when @ph then writes_from_remote_client-@p_writes_from_remote_client else 0 end as writes_from_remote_client,@ph:=host,@p_block_reports_num_ops:=block_reports_num_op
 s,@p_block_verification_failures:=block_verification_failures,@p_blocks_read:=blocks_read,@p_blocks_removed:=blocks_removed,@p_blocks_replicated:=blocks_replicated,@p_blocks_verified:=blocks_verified,@p_blocks_written:=blocks_written,@p_bytes_read:=bytes_read,@p_bytes_written:=bytes_written,@p_copy_block_op_num_ops:=copy_block_op_num_ops,@p_heart_beats_num_ops:=heart_beats_num_ops,@p_read_block_op_num_ops:=read_block_op_num_ops,@p_read_metadata_op_num_ops:=read_metadata_op_num_ops,@p_reads_from_local_client:=reads_from_local_client,@p_reads_from_remote_client:=reads_from_remote_client,@p_replace_block_op_num_ops:=replace_block_op_num_ops,@p_write_block_op_num_ops:=write_block_op_num_ops,@p_writes_from_local_client:=writes_from_local_client,@p_writes_from_remote_client:=writes_from_remote_client
from [dfs_datanode] where timestamp between '[past_15_minutes]' and '[past_5_minutes]' group
by host,timestamp) as a where timestamp!='[past_15_minutes]' group by timestamp);
-replace into [cluster_disk] (select a.timestamp,a.mount,a.used,a.available,a.used_percent
from (select from_unixtime(unix_timestamp(timestamp)-unix_timestamp(timestamp)%60)as timestamp,mount,avg(used)
as used,avg(available) as available,avg(used_percent) as used_percent from [disk] where timestamp
between '[past_10_minutes]' and '[past_5_minutes]' group by timestamp,mount) as a group by
a.timestamp, a.mount);
-#replace delayed into [hod_job_digest] (select timestamp,d.hodid,d.userid,[avg(system_metrics)]
from (select a.HodID,b.host as machine,a.userid,a.starttime,a.endtime from [HodJob] a join
[hod_machine] b on (a.HodID = b.HodID) where endtime between '[past_10_minutes]' and '[past_5_minutes]')
as d,[system_metrics] where timestamp between d.starttime and d.endtime and host=d.machine
group by hodid,timestamp);
-replace into [cluster_hadoop_rpc] (select timestamp, count(host), avg(rpc_processing_time_avg_time),
sum(rpc_processing_time_num_ops), avg(rpc_queue_time_avg_time), sum(rpc_queue_time_num_ops),
avg(get_build_version_avg_time), sum(get_build_version_num_ops), avg(get_job_counters_avg_time),
sum(get_job_counters_num_ops), avg(get_job_profile_avg_time), sum(get_job_profile_num_ops),
avg(get_job_status_avg_time), sum(get_job_status_num_ops), avg(get_new_job_id_avg_time), sum(get_new_job_id_num_ops),
avg(get_protocol_version_avg_time), sum(get_protocol_version_num_ops), avg(get_system_dir_avg_time),
sum(get_system_dir_num_ops), avg(get_task_completion_events_avg_time), sum(get_task_completion_events_num_ops),
avg(get_task_diagnostics_avg_time), sum(get_task_diagnostics_num_ops), avg(heartbeat_avg_time),
sum(heartbeat_num_ops), avg(killJob_avg_time), sum(killJob_num_ops), avg(submit_job_avg_time),
sum(submit_job_num_ops) from [hadoop_rpc] where timestamp between '[past_10_minutes]
 ' and '[past_5_minutes]' group by timestamp);
-#replace into [user_util] (select timestamp, j.UserID as user, sum(j.NumOfMachines) as node_total,
sum(cpu_idle_pcnt*j.NumOfMachines) as cpu_unused, sum((cpu_user_pcnt+cpu_system_pcnt)*j.NumOfMachines)
as cpu_used, avg(cpu_user_pcnt+cpu_system_pcnt) as cpu_used_pcnt, sum((100-(sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_unused, sum(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_used, avg(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)) as disk_used_pcnt,
sum((100-eth0_busy_pcnt)*j.NumOfMachines) as network_unused, sum(eth0_busy_pcnt*j.NumOfMachines)
as network_used, avg(eth0_busy_pcnt) as network_used_pcnt, sum((100-mem_used_pcnt)*j.NumOfMachines)
as memory_unused, sum(mem_used_pcnt*j.NumOfMachines) as memory_used, avg(mem_used_pcnt) as
memory_used_pcnt from [hod_job_digest] d,[HodJob] j where (d.HodID = j.HodID) and Timestamp
between '[past_10_minutes]' and '[past_5_minutes]' grou
 p by j.UserID);
+replace into [cluster_disk] (select a.timestamp,a.mount,a.used,a.available,a.used_percent
from (select from_unixtime(unix_timestamp(timestamp)-unix_timestamp(timestamp)%60)as timestamp,mount,avg(used)
as used,avg(available) as available,avg(used_percent) as used_percent from [disk] where timestamp
between '[past_15_minutes]' and '[past_5_minutes]' group by timestamp,mount) as a group by
a.timestamp, a.mount);
+#replace delayed into [hod_job_digest] (select timestamp,d.hodid,d.userid,[avg(system_metrics)]
from (select a.HodID,b.host as machine,a.userid,a.starttime,a.endtime from [HodJob] a join
[hod_machine] b on (a.HodID = b.HodID) where endtime between '[past_15_minutes]' and '[past_5_minutes]')
as d,[system_metrics] where timestamp between d.starttime and d.endtime and host=d.machine
group by hodid,timestamp);
+replace into [cluster_hadoop_rpc] (select timestamp, count(host), avg(rpc_processing_time_avg_time),
sum(rpc_processing_time_num_ops), avg(rpc_queue_time_avg_time), sum(rpc_queue_time_num_ops),
avg(get_build_version_avg_time), sum(get_build_version_num_ops), avg(get_job_counters_avg_time),
sum(get_job_counters_num_ops), avg(get_job_profile_avg_time), sum(get_job_profile_num_ops),
avg(get_job_status_avg_time), sum(get_job_status_num_ops), avg(get_new_job_id_avg_time), sum(get_new_job_id_num_ops),
avg(get_protocol_version_avg_time), sum(get_protocol_version_num_ops), avg(get_system_dir_avg_time),
sum(get_system_dir_num_ops), avg(get_task_completion_events_avg_time), sum(get_task_completion_events_num_ops),
avg(get_task_diagnostics_avg_time), sum(get_task_diagnostics_num_ops), avg(heartbeat_avg_time),
sum(heartbeat_num_ops), avg(killJob_avg_time), sum(killJob_num_ops), avg(submit_job_avg_time),
sum(submit_job_num_ops) from [hadoop_rpc] where timestamp between '[past_15_minutes]
 ' and '[past_5_minutes]' group by timestamp);
+#replace into [user_util] (select timestamp, j.UserID as user, sum(j.NumOfMachines) as node_total,
sum(cpu_idle_pcnt*j.NumOfMachines) as cpu_unused, sum((cpu_user_pcnt+cpu_system_pcnt)*j.NumOfMachines)
as cpu_used, avg(cpu_user_pcnt+cpu_system_pcnt) as cpu_used_pcnt, sum((100-(sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_unused, sum(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_used, avg(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)) as disk_used_pcnt,
sum((100-eth0_busy_pcnt)*j.NumOfMachines) as network_unused, sum(eth0_busy_pcnt*j.NumOfMachines)
as network_used, avg(eth0_busy_pcnt) as network_used_pcnt, sum((100-mem_used_pcnt)*j.NumOfMachines)
as memory_unused, sum(mem_used_pcnt*j.NumOfMachines) as memory_used, avg(mem_used_pcnt) as
memory_used_pcnt from [hod_job_digest] d,[HodJob] j where (d.HodID = j.HodID) and Timestamp
between '[past_15_minutes]' and '[past_5_minutes]' grou
 p by j.UserID);
 replace into [util] (select [hdfs_usage].timestamp as timestamp, mr_usage.user, queue, sum(bytes)
as bytes, sum(slot_time)/3600 as slot_hours from [hdfs_usage], (select job.finish_time as
timestamp,job.user,queue,sum(([mr_task].finish_time-[mr_task].start_time)*[mr_task].attempts)
as slot_time from [mr_task],  (select job_id,user,queue,launch_time,finish_time from [mr_job]
where finish_time between '[past_20_minutes]' and '[now]') as job  where [mr_task].job_id=job.job_id
group by floor(unix_timestamp(timestamp)/600),user,queue) as mr_usage where mr_usage.user=[hdfs_usage].user
and [hdfs_usage].timestamp between '[past_20_minutes]' and '[now]' group by floor(unix_timestamp([hdfs_usage].timestamp)/600),
user);
 #
 # Down sample metrics for charts

Modified: hadoop/chukwa/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/CHANGES.txt?rev=767803&r1=767802&r2=767803&view=diff
==============================================================================
--- hadoop/chukwa/trunk/CHANGES.txt (original)
+++ hadoop/chukwa/trunk/CHANGES.txt Thu Apr 23 05:30:02 2009
@@ -38,6 +38,8 @@
 
   IMPROVEMENTS
 
+    CHUKWA-178. Extend aggregation by 5 minutes for cluster metrics. (Eric Yang)
+
     CHUKWA-176. Rearrange the parameter order for aggregator.sh. (Eric Yang)
 
     CHUKWA-173. Parameterize configuration, and enable substitution at build time. (Jerome
Boulon via Eric Yang)
@@ -113,6 +115,8 @@
 
   BUG FIXES
 
+    CHUKWA-158. Consolidate namenode address to a single configuration object. (Eric Yang)
+
     CHUKWA-168. Added watchdog for database. (Eric Yang)
 
     CHUKWA-175.  Removed error message for shutting down data processors. (Eric Yang)

Modified: hadoop/chukwa/trunk/conf/aggregator.sql
URL: http://svn.apache.org/viewvc/hadoop/chukwa/trunk/conf/aggregator.sql?rev=767803&r1=767802&r2=767803&view=diff
==============================================================================
--- hadoop/chukwa/trunk/conf/aggregator.sql (original)
+++ hadoop/chukwa/trunk/conf/aggregator.sql Thu Apr 23 05:30:02 2009
@@ -3,14 +3,14 @@
 #insert into [a] select d.Timestamp as starttime,((AvgCPUBusy * j.NumOfMachines) / (sum(j.NumOfMachines)
* 1)) as used from Digest d join HodJob j on (d.HodID = j.HodID) where d.Timestamp >= '[past_10_minutes]'
and d.Timestamp <= '[now]' group by d.Timestamp order by d.Timestamp 
 #insert into [b] select m, sum(foo.nodehours) as nodehours from (select m.MRJobID, round(avg(if(AvgCPUBusy
is null,0,AvgCPUBusy)),0) as m, count(*)*j.NumOfMachines/60 as nodehours from HodJobDigest
d join HodJob j on (d.HodID = j.HodID) join MRJob m on (m.HodID = j.HodID) where d.Timestamp
>= '[past_10_minutes]' and d.Timestamp <= '[now]' and d.Timestamp >= m.LAUNCH_TIME
and d.Timestamp <= m.FINISH_TIME group by m.MRJobID) as foo group by m; 
 #insert into [c] select if(AvgCPUBusy is null,0,AvgCPUBusy) as m, CASE WHEN MRJobName like
'PigLatin%' THEN 'Pig' WHEN MRJobName like 'streamjob%' THEN 'Streaming' WHEN MRJobName like
'%abacus%' THEN 'Abacus' ELSE 'Other' END as interface, count(*)*j.NumOfMachines/60 as nodehours,count(distinct(MRJobID))
as jobs from HodJobDigest d join HodJob j on (d.HodID = j.HodID) join MRJob m on (m.HodID
= j.HodID) where d.Timestamp >= '[past_10_minutes]' and d.Timestamp <= '[now]' and d.Timestamp
>= m.LAUNCH_TIME and d.Timestamp <= m.FINISH_TIME group by AvgCPUBusy,CASE WHEN MRJobName
like 'PigLatin%' THEN 'Pig' WHEN MRJobName like 'streamjob%' THEN 'Streaming' WHEN MRJobName
like '%abacus%' THEN 'Abacus' ELSE 'Other' END order by if(AvgCPUBusy is null,0,AvgCPUBusy)
-#insert into [cluster_hadoop_mapred] (select timestamp,[avg(hadoop_mapred_job)] from [hadoop_mapred_job]
where timestamp between '[past_10_minutes]' and '[now]' group by timestamp);
-replace into [cluster_system_metrics] (select timestamp,[avg(system_metrics)] from [system_metrics]
where timestamp between '[past_10_minutes]' and '[past_5_minutes]' group by timestamp);
+#insert into [cluster_hadoop_mapred] (select timestamp,[avg(hadoop_mapred_job)] from [hadoop_mapred_job]
where timestamp between '[past_15_minutes]' and '[now]' group by timestamp);
+replace into [cluster_system_metrics] (select timestamp,[avg(system_metrics)] from [system_metrics]
where timestamp between '[past_15_minutes]' and '[past_5_minutes]' group by timestamp);
 set @ph='',@p_block_reports_num_ops:=0,@p_block_verification_failures:=0,@p_blocks_read:=0,@p_blocks_removed:=0,@p_blocks_replicated:=0,@p_blocks_verified:=0,@p_blocks_written:=0,@p_bytes_read:=0,@p_bytes_written:=0,@p_copy_block_op_num_ops:=0,@p_heart_beats_num_ops:=0,@p_read_block_op_num_ops:=0,@p_read_metadata_op_num_ops:=0,@p_reads_from_local_client:=0,@p_reads_from_remote_client:=0,@p_replace_block_op_num_ops:=0,@p_write_block_op_num_ops:=0,@p_writes_from_local_client:=0,@p_writes_from_remote_client:=0;
 replace into [dfs_throughput] (select timestamp,count(host),avg(block_reports_avg_time),sum(block_reports_num_ops),sum(block_verification_failures),sum(blocks_read),sum(blocks_removed),sum(blocks_replicated),sum(blocks_verified),sum(blocks_written),sum(bytes_read),sum(bytes_written),avg(copy_block_op_avg_time),sum(copy_block_op_num_ops),avg(heart_beats_avg_time),sum(heart_beats_num_ops),avg(read_block_op_avg_time),sum(read_block_op_num_ops),avg(read_metadata_op_avg_time),sum(read_metadata_op_num_ops),sum(reads_from_local_client),sum(reads_from_remote_client),avg(replace_block_op_avg_time),sum(replace_block_op_num_ops),count(session_id),avg(write_block_op_avg_time),sum(write_block_op_num_ops),sum(writes_from_local_client),sum(writes_from_remote_client)
from (select timestamp,host,block_reports_avg_time,case host when @ph then block_reports_num_ops-@p_block_reports_num_ops
else 0 end as block_reports_num_ops, case host when @ph then block_verification_failures-@p_block_verific
 ation_failures else 0 end as block_verification_failures,case host when @ph then blocks_read-@p_blocks_read
else 0 end as blocks_read,case host when @ph then blocks_removed-@p_blocks_removed else 0
end as blocks_removed,case host when @ph then blocks_replicated-@p_blocks_replicated else
0 end as blocks_replicated,case host when @ph then blocks_verified-@p_blocks_verified else
0 end as blocks_verified,case host when @ph then blocks_written-@p_blocks_written else 0 end
as blocks_written,case host when @ph then bytes_read-@p_bytes_read else 0 end as bytes_read,case
host when @ph then bytes_written-@p_bytes_written else 0 end as bytes_written,copy_block_op_avg_time,case
host when @ph then copy_block_op_num_ops-@p_copy_block_op_num_ops else 0 end as copy_block_op_num_ops,heart_beats_avg_time,case
host when @ph then heart_beats_num_ops-@p_heart_beats_num_ops else 0 end as heart_beats_num_ops,read_block_op_avg_time,case
host when @ph then read_block_op_num_ops-@p_read_block_op_num_
 ops else 0 end as read_block_op_num_ops,read_metadata_op_avg_time,case host when @ph then
read_metadata_op_num_ops-@p_read_metadata_op_num_ops else 0 end as read_metadata_op_num_ops,case
host when @ph then reads_from_local_client-@p_reads_from_local_client else 0 end as reads_from_local_client,case
host when @ph then reads_from_remote_client-@p_reads_from_remote_client else 0 end as reads_from_remote_client,replace_block_op_avg_time,case
host when @ph then replace_block_op_num_ops-@p_replace_block_op_num_ops else 0 end as replace_block_op_num_ops,session_id,write_block_op_avg_time,case
host when @ph then write_block_op_num_ops-@p_write_block_op_num_ops else 0 end as write_block_op_num_ops,case
host when @ph then writes_from_local_client-@p_writes_from_local_client else 0 end as writes_from_local_client,case
host when @ph then writes_from_remote_client-@p_writes_from_remote_client else 0 end as writes_from_remote_client,@ph:=host,@p_block_reports_num_ops:=block_reports_num_op
 s,@p_block_verification_failures:=block_verification_failures,@p_blocks_read:=blocks_read,@p_blocks_removed:=blocks_removed,@p_blocks_replicated:=blocks_replicated,@p_blocks_verified:=blocks_verified,@p_blocks_written:=blocks_written,@p_bytes_read:=bytes_read,@p_bytes_written:=bytes_written,@p_copy_block_op_num_ops:=copy_block_op_num_ops,@p_heart_beats_num_ops:=heart_beats_num_ops,@p_read_block_op_num_ops:=read_block_op_num_ops,@p_read_metadata_op_num_ops:=read_metadata_op_num_ops,@p_reads_from_local_client:=reads_from_local_client,@p_reads_from_remote_client:=reads_from_remote_client,@p_replace_block_op_num_ops:=replace_block_op_num_ops,@p_write_block_op_num_ops:=write_block_op_num_ops,@p_writes_from_local_client:=writes_from_local_client,@p_writes_from_remote_client:=writes_from_remote_client
from [dfs_datanode] where timestamp between '[past_15_minutes]' and '[past_5_minutes]' group
by host,timestamp) as a where timestamp!='[past_15_minutes]' group by timestamp);
-replace into [cluster_disk] (select a.timestamp,a.mount,a.used,a.available,a.used_percent
from (select from_unixtime(unix_timestamp(timestamp)-unix_timestamp(timestamp)%60)as timestamp,mount,avg(used)
as used,avg(available) as available,avg(used_percent) as used_percent from [disk] where timestamp
between '[past_10_minutes]' and '[past_5_minutes]' group by timestamp,mount) as a group by
a.timestamp, a.mount);
-#replace delayed into [hod_job_digest] (select timestamp,d.hodid,d.userid,[avg(system_metrics)]
from (select a.HodID,b.host as machine,a.userid,a.starttime,a.endtime from [HodJob] a join
[hod_machine] b on (a.HodID = b.HodID) where endtime between '[past_10_minutes]' and '[past_5_minutes]')
as d,[system_metrics] where timestamp between d.starttime and d.endtime and host=d.machine
group by hodid,timestamp);
-replace into [cluster_hadoop_rpc] (select timestamp, count(host), avg(rpc_processing_time_avg_time),
sum(rpc_processing_time_num_ops), avg(rpc_queue_time_avg_time), sum(rpc_queue_time_num_ops),
avg(get_build_version_avg_time), sum(get_build_version_num_ops), avg(get_job_counters_avg_time),
sum(get_job_counters_num_ops), avg(get_job_profile_avg_time), sum(get_job_profile_num_ops),
avg(get_job_status_avg_time), sum(get_job_status_num_ops), avg(get_new_job_id_avg_time), sum(get_new_job_id_num_ops),
avg(get_protocol_version_avg_time), sum(get_protocol_version_num_ops), avg(get_system_dir_avg_time),
sum(get_system_dir_num_ops), avg(get_task_completion_events_avg_time), sum(get_task_completion_events_num_ops),
avg(get_task_diagnostics_avg_time), sum(get_task_diagnostics_num_ops), avg(heartbeat_avg_time),
sum(heartbeat_num_ops), avg(killJob_avg_time), sum(killJob_num_ops), avg(submit_job_avg_time),
sum(submit_job_num_ops) from [hadoop_rpc] where timestamp between '[past_10_minutes]
 ' and '[past_5_minutes]' group by timestamp);
-#replace into [user_util] (select timestamp, j.UserID as user, sum(j.NumOfMachines) as node_total,
sum(cpu_idle_pcnt*j.NumOfMachines) as cpu_unused, sum((cpu_user_pcnt+cpu_system_pcnt)*j.NumOfMachines)
as cpu_used, avg(cpu_user_pcnt+cpu_system_pcnt) as cpu_used_pcnt, sum((100-(sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_unused, sum(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_used, avg(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)) as disk_used_pcnt,
sum((100-eth0_busy_pcnt)*j.NumOfMachines) as network_unused, sum(eth0_busy_pcnt*j.NumOfMachines)
as network_used, avg(eth0_busy_pcnt) as network_used_pcnt, sum((100-mem_used_pcnt)*j.NumOfMachines)
as memory_unused, sum(mem_used_pcnt*j.NumOfMachines) as memory_used, avg(mem_used_pcnt) as
memory_used_pcnt from [hod_job_digest] d,[HodJob] j where (d.HodID = j.HodID) and Timestamp
between '[past_10_minutes]' and '[past_5_minutes]' grou
 p by j.UserID);
+replace into [cluster_disk] (select a.timestamp,a.mount,a.used,a.available,a.used_percent
from (select from_unixtime(unix_timestamp(timestamp)-unix_timestamp(timestamp)%60)as timestamp,mount,avg(used)
as used,avg(available) as available,avg(used_percent) as used_percent from [disk] where timestamp
between '[past_15_minutes]' and '[past_5_minutes]' group by timestamp,mount) as a group by
a.timestamp, a.mount);
+#replace delayed into [hod_job_digest] (select timestamp,d.hodid,d.userid,[avg(system_metrics)]
from (select a.HodID,b.host as machine,a.userid,a.starttime,a.endtime from [HodJob] a join
[hod_machine] b on (a.HodID = b.HodID) where endtime between '[past_15_minutes]' and '[past_5_minutes]')
as d,[system_metrics] where timestamp between d.starttime and d.endtime and host=d.machine
group by hodid,timestamp);
+replace into [cluster_hadoop_rpc] (select timestamp, count(host), avg(rpc_processing_time_avg_time),
sum(rpc_processing_time_num_ops), avg(rpc_queue_time_avg_time), sum(rpc_queue_time_num_ops),
avg(get_build_version_avg_time), sum(get_build_version_num_ops), avg(get_job_counters_avg_time),
sum(get_job_counters_num_ops), avg(get_job_profile_avg_time), sum(get_job_profile_num_ops),
avg(get_job_status_avg_time), sum(get_job_status_num_ops), avg(get_new_job_id_avg_time), sum(get_new_job_id_num_ops),
avg(get_protocol_version_avg_time), sum(get_protocol_version_num_ops), avg(get_system_dir_avg_time),
sum(get_system_dir_num_ops), avg(get_task_completion_events_avg_time), sum(get_task_completion_events_num_ops),
avg(get_task_diagnostics_avg_time), sum(get_task_diagnostics_num_ops), avg(heartbeat_avg_time),
sum(heartbeat_num_ops), avg(killJob_avg_time), sum(killJob_num_ops), avg(submit_job_avg_time),
sum(submit_job_num_ops) from [hadoop_rpc] where timestamp between '[past_15_minutes]
 ' and '[past_5_minutes]' group by timestamp);
+#replace into [user_util] (select timestamp, j.UserID as user, sum(j.NumOfMachines) as node_total,
sum(cpu_idle_pcnt*j.NumOfMachines) as cpu_unused, sum((cpu_user_pcnt+cpu_system_pcnt)*j.NumOfMachines)
as cpu_used, avg(cpu_user_pcnt+cpu_system_pcnt) as cpu_used_pcnt, sum((100-(sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_unused, sum(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)*j.NumOfMachines)
as disk_used, avg(((sda_busy_pcnt+sdb_busy_pcnt+sdc_busy_pcnt+sdd_busy_pcnt)/4)) as disk_used_pcnt,
sum((100-eth0_busy_pcnt)*j.NumOfMachines) as network_unused, sum(eth0_busy_pcnt*j.NumOfMachines)
as network_used, avg(eth0_busy_pcnt) as network_used_pcnt, sum((100-mem_used_pcnt)*j.NumOfMachines)
as memory_unused, sum(mem_used_pcnt*j.NumOfMachines) as memory_used, avg(mem_used_pcnt) as
memory_used_pcnt from [hod_job_digest] d,[HodJob] j where (d.HodID = j.HodID) and Timestamp
between '[past_15_minutes]' and '[past_5_minutes]' grou
 p by j.UserID);
 replace into [util] (select [hdfs_usage].timestamp as timestamp, mr_usage.user, queue, sum(bytes)
as bytes, sum(slot_time)/3600 as slot_hours from [hdfs_usage], (select job.finish_time as
timestamp,job.user,queue,sum(([mr_task].finish_time-[mr_task].start_time)*[mr_task].attempts)
as slot_time from [mr_task],  (select job_id,user,queue,launch_time,finish_time from [mr_job]
where finish_time between '[past_20_minutes]' and '[now]') as job  where [mr_task].job_id=job.job_id
group by floor(unix_timestamp(timestamp)/600),user,queue) as mr_usage where mr_usage.user=[hdfs_usage].user
and [hdfs_usage].timestamp between '[past_20_minutes]' and '[now]' group by floor(unix_timestamp([hdfs_usage].timestamp)/600),
user);
 #
 # Down sample metrics for charts



Mime
View raw message