hadoop-mapreduce-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yhema...@apache.org
Subject svn commit: r802502 - in /hadoop/mapreduce/trunk: ./ src/java/org/apache/hadoop/mapred/ src/test/ src/test/mapred/org/apache/hadoop/mapred/ src/webapps/job/
Date Sun, 09 Aug 2009 09:45:32 GMT
Author: yhemanth
Date: Sun Aug  9 09:45:31 2009
New Revision: 802502

URL: http://svn.apache.org/viewvc?rev=802502&view=rev
Log:
MAPREDUCE-779. Added node health failure counts into JobTrackerStatistics. Contributed by
Sreekanth Ramakrishnan.

Modified:
    hadoop/mapreduce/trunk/CHANGES.txt
    hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
    hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTrackerStatistics.java
    hadoop/mapreduce/trunk/src/test/commit-tests
    hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapred/TestTaskTrackerBlacklisting.java
    hadoop/mapreduce/trunk/src/webapps/job/machines.jsp

Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Sun Aug  9 09:45:31 2009
@@ -175,6 +175,9 @@
     and org.apache.hadoop.mapred.MapFileOutputFormat to use new api.
     (Amareshwari Sriramadasu via ddas)
 
+    MAPREDUCE-779. Added node health failure counts into 
+    JobTrackerStatistics. (Sreekanth Ramakrishnan via yhemanth)
+
   BUG FIXES
     MAPREDUCE-703. Sqoop requires dependency on hsqldb in ivy.
     (Aaron Kimball via matei)

Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Sun Aug  9 09:45:31 2009
@@ -65,6 +65,7 @@
 import org.apache.hadoop.mapred.JobHistory.Listener;
 import org.apache.hadoop.mapred.JobHistory.Values;
 import org.apache.hadoop.mapred.JobStatusChangeEvent.EventType;
+import org.apache.hadoop.mapred.JobTrackerStatistics.TaskTrackerStat;
 import org.apache.hadoop.mapred.TaskTrackerStatus.TaskTrackerHealthStatus;
 import org.apache.hadoop.net.DNSToSwitchMapping;
 import org.apache.hadoop.net.NetUtils;
@@ -843,7 +844,8 @@
       if (!isHealthy) {
         fi = getFaultInfo(hostName, true);
         fi.setHealthy(isHealthy);
-        synchronized (potentiallyFaultyTrackers) {
+        updateNodeHealthFailureStatistics(hostName, fi);
+        synchronized (potentiallyFaultyTrackers) { 
           blackListTracker(hostName, reason,
               ReasonForBlackListing.NODE_UNHEALTHY);
         }
@@ -859,6 +861,34 @@
         }
       }
     }
+
+    /**
+     * Update the node health failure statistics of the given
+     * host.
+     * 
+     * We increment the count only when the host transitions
+     * from healthy -> unhealthy. 
+     * 
+     * @param hostName
+     * @param fi Fault info object for the host.
+     */
+    private void updateNodeHealthFailureStatistics(String hostName, 
+        FaultInfo fi) {
+      //Check if the node was already blacklisted due to 
+      //unhealthy reason. If so dont increment the count.
+      if (!fi.getReasonforblacklisting().contains(
+          ReasonForBlackListing.NODE_UNHEALTHY)) {
+        Set<TaskTracker> trackers = hostnameToTaskTracker.get(hostName);
+        synchronized (trackers) {
+          for (TaskTracker t : trackers) {
+            TaskTrackerStat stat = statistics.getTaskTrackerStat(
+                t.getTrackerName());
+            stat.incrHealthCheckFailed();
+          }
+        }
+      }
+    }
+    
   }
   
   /**

Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTrackerStatistics.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTrackerStatistics.java?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTrackerStatistics.java (original)
+++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapred/JobTrackerStatistics.java Sun Aug  9 09:45:31 2009
@@ -51,7 +51,7 @@
       stat.remove();
     }
   }
-
+  
   synchronized TaskTrackerStat getTaskTrackerStat(String name) {
     return ttStats.get(name);
   }
@@ -62,12 +62,17 @@
 
     final String succeededTasksKey;
     final Stat succeededTasksStat;
+    
+    final String healthCheckFailedKey;
+    final Stat healthCheckFailedStat;
 
     TaskTrackerStat(String trackerName) {
       totalTasksKey = trackerName+"-"+"totalTasks";
       totalTasksStat = collector.createStat(totalTasksKey);
       succeededTasksKey = trackerName+"-"+"succeededTasks";
       succeededTasksStat = collector.createStat(succeededTasksKey);
+      healthCheckFailedKey = trackerName + "-"+ "healthcheckfailed";
+      healthCheckFailedStat = collector.createStat(healthCheckFailedKey);
     }
 
     synchronized void incrTotalTasks() {
@@ -77,10 +82,15 @@
     synchronized void incrSucceededTasks() {
       succeededTasksStat.inc();
     }
+    
+    synchronized void incrHealthCheckFailed() {
+      healthCheckFailedStat.inc();
+    }
 
     synchronized void remove() {
       collector.removeStat(totalTasksKey);
       collector.removeStat(succeededTasksKey);
+      collector.removeStat(healthCheckFailedKey);
     }
 
   }

Modified: hadoop/mapreduce/trunk/src/test/commit-tests
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/commit-tests?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/commit-tests (original)
+++ hadoop/mapreduce/trunk/src/test/commit-tests Sun Aug  9 09:45:31 2009
@@ -35,3 +35,5 @@
 **/TestTextInputFormat.java
 **/TestTextOutputFormat.java
 **/TestTrackerBlacklistAcrossJobs.java
+**/TestTaskTrackerBlacklisting.java
+

Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapred/TestTaskTrackerBlacklisting.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapred/TestTaskTrackerBlacklisting.java?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapred/TestTaskTrackerBlacklisting.java (original)
+++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapred/TestTaskTrackerBlacklisting.java Sun Aug  9 09:45:31 2009
@@ -207,6 +207,58 @@
     assertEquals("Trackers still blacklisted after healthy report", jobTracker
         .getBlacklistedTrackerCount(), 0);
   }
+  
+  
+  /**
+   * Test case to check if the task tracker node health failure statistics
+   * is populated correctly.
+   * 
+   * We check the since start property and assume that other properties would
+   * be populated in a correct manner.
+   */
+  public void testTaskTrackerNodeHealthFailureStatistics() throws Exception {
+    //populate previous failure count, as the job tracker is bought up only
+    //once in setup of test cases to run all node health blacklist stuff.
+    int failureCount = getFailureCountSinceStart(jobTracker, trackers[0]);
+    sendHeartBeat(null, false);
+    for(String tracker: trackers) {
+      assertEquals("Failure count updated wrongly for tracker : " + tracker,
+          failureCount, getFailureCountSinceStart(jobTracker, tracker));
+    }
+    
+    TaskTrackerHealthStatus status = getUnhealthyNodeStatus("ERROR");
+    sendHeartBeat(status, false);
+    //When the node fails due to health check, the statistics is 
+    //incremented.
+    failureCount++;
+    for(String tracker: trackers) {
+      assertEquals("Failure count updated wrongly for tracker : " + tracker,
+          failureCount, getFailureCountSinceStart(jobTracker, tracker));
+    }
+    //even if the node reports unhealthy in next status update we dont
+    //increment it. We increment the statistics if the node goes back to
+    //healthy and then becomes unhealthy.
+    sendHeartBeat(status, false);
+    for(String tracker: trackers) {
+      assertEquals("Failure count updated wrongly for tracker : " + tracker,
+          failureCount, getFailureCountSinceStart(jobTracker, tracker));
+    }
+    //make nodes all healthy, but the failure statistics should be 
+    //carried forward.
+    sendHeartBeat(null, false);
+    for(String tracker: trackers) {
+      assertEquals("Failure count updated wrongly for tracker : " + tracker,
+          failureCount, getFailureCountSinceStart(jobTracker, tracker));
+    }
+  }
+  
+  private int getFailureCountSinceStart(JobTracker jt, String tracker) {
+    JobTrackerStatistics jtStats = jt.getStatistics();
+    StatisticsCollector collector = jtStats.collector;
+    collector.update();
+    return jtStats.getTaskTrackerStat(tracker).healthCheckFailedStat
+        .getValues().get(StatisticsCollector.SINCE_START).getValue();
+  }
 
   public void testBlackListingWithFailuresAndHealthStatus() throws Exception {
     runBlackListingJob(jobTracker, trackers);

Modified: hadoop/mapreduce/trunk/src/webapps/job/machines.jsp
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/webapps/job/machines.jsp?rev=802502&r1=802501&r2=802502&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/webapps/job/machines.jsp (original)
+++ hadoop/mapreduce/trunk/src/webapps/job/machines.jsp Sun Aug  9 09:45:31 2009
@@ -51,7 +51,7 @@
       c = tracker.taskTrackers();
     }
     int noCols = 9 + 
-      (2 * tracker.getStatistics().collector.DEFAULT_COLLECT_WINDOWS.length);
+      (3 * tracker.getStatistics().collector.DEFAULT_COLLECT_WINDOWS.length);
     if(type.equals("blacklisted")) {
       noCols = noCols + 1;
     }
@@ -75,8 +75,9 @@
            collector.DEFAULT_COLLECT_WINDOWS) {
          out.println("<td><b>Total Tasks "+window.name+"</b></td>");
          out.println("<td><b>Succeeded Tasks "+window.name+"</b></td>");
+         out.println("<td><b>Failed Health Checks " 
+               + window.name+"</b></td>"); 
        }
-      
       out.print("<td><b>Seconds since heartbeat</b></td></tr>\n");
 
       int maxFailures = 0;
@@ -127,6 +128,8 @@
                                 get(window).getValue());
           out.println("</td><td>" + ttStat.succeededTasksStat.getValues().
                                 get(window).getValue());
+          out.println("</td><td>" + ttStat.healthCheckFailedStat.
+              getValues().get(window).getValue());
         }
         
         out.print("</td><td>" + sinceHeartbeat + "</td></tr>\n");



Mime
View raw message