hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tomwh...@apache.org
Subject svn commit: r531832 - in /lucene/hadoop/trunk: ./ src/java/org/apache/hadoop/mapred/ src/webapps/job/
Date Tue, 24 Apr 2007 08:35:34 GMT
Author: tomwhite
Date: Tue Apr 24 01:35:33 2007
New Revision: 531832

URL: http://svn.apache.org/viewvc?view=rev&rev=531832
Log:
HADOOP-1050.  Distinguish between failed and killed tasks so as to not count a lost tasktracker
against the job.  Contributed by Arun C Murthy.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java
    lucene/hadoop/trunk/src/webapps/job/jobdetails.jsp
    lucene/hadoop/trunk/src/webapps/job/jobfailures.jsp

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Tue Apr 24 01:35:33 2007
@@ -238,6 +238,10 @@
     rename causing possible reduce task hang.
     (Tahir Hashmi via tomwhite)
 
+72. HADOOP-1050.  Distinguish between failed and killed tasks so as to 
+    not count a lost tasktracker against the job.  
+    (Arun C Murthy via tomwhite)
+
 
 Release 0.12.3 - 2007-04-06
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java Tue Apr 24 01:35:33 2007
@@ -782,7 +782,7 @@
                           TaskStatus status, String trackerName,
                           boolean wasRunning, boolean wasComplete) {
     // Mark the taskid as a 'failure'
-    tip.failedSubTask(taskid, trackerName);
+    tip.incompleteSubTask(taskid, trackerName);
         
     boolean isRunning = tip.isRunning();
     boolean isComplete = tip.isComplete();
@@ -861,14 +861,14 @@
    * @param reason The reason that the task failed
    * @param trackerName The task tracker the task failed on
    */
-  public void failedTask(TaskInProgress tip, String taskid, 
-                         String reason, TaskStatus.Phase phase, 
+  public void failedTask(TaskInProgress tip, String taskid, String reason, 
+                         TaskStatus.Phase phase, TaskStatus.State state, 
                          String hostname, String trackerName,
                          JobTrackerMetrics metrics) {
     TaskStatus status = new TaskStatus(taskid,
                                        tip.isMapTask(),
                                        0.0f,
-                                       TaskStatus.State.FAILED,
+                                       state,
                                        reason,
                                        reason,
                                        trackerName, phase,

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Tue Apr 24 01:35:33 2007
@@ -194,6 +194,7 @@
                       job.failedTask(tip, taskId, "Error launching task", 
                                      tip.isMapTask()? TaskStatus.Phase.MAP:
                                      TaskStatus.Phase.STARTING,
+                                     TaskStatus.State.FAILED,
                                      trackerStatus.getHost(), trackerName,
                                      myMetrics);
                   }
@@ -1676,8 +1677,8 @@
           // if the job is done, we don't want to change anything
           if (job.getStatus().getRunState() == JobStatus.RUNNING) {
             job.failedTask(tip, taskId, "Lost task tracker", 
-                           TaskStatus.Phase.MAP, hostname, trackerName, 
-                           myMetrics);
+                           TaskStatus.Phase.MAP, TaskStatus.State.KILLED,
+                           hostname, trackerName, myMetrics);
           }
         } else if (!tip.isMapTask() && tip.isComplete()) {
           // Completed 'reduce' task, not failed;

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java Tue Apr 24 01:35:33 2007
@@ -72,6 +72,7 @@
   // Status of the TIP
   private int successEventNumber = -1;
   private int numTaskFailures = 0;
+  private int numKilledTasks = 0;
   private double progress = 0;
   private String state = "";
   private long startTime = 0;
@@ -246,6 +247,13 @@
   }
 
   /**
+   * Number of times the TaskInProgress has been killed by the framework.
+   */
+  public int numKilledTasks() {
+    return numKilledTasks;
+  }
+
+  /**
    * Get the overall progress (from 0 to 1.0) for this TIP
    */
   public double getProgress() {
@@ -374,25 +382,40 @@
    * Indicate that one of the taskids in this TaskInProgress
    * has failed.
    */
-  public void failedSubTask(String taskid, String trackerName) {
+  public void incompleteSubTask(String taskid, String trackerName) {
     //
     // Note the failure and its location
     //
     LOG.info("Task '" + taskid + "' has been lost.");
     TaskStatus status = taskStatuses.get(taskid);
+    TaskStatus.State taskState = TaskStatus.State.FAILED;
     if (status != null) {
-      status.setRunState(TaskStatus.State.FAILED);
+      taskState = status.getRunState();
+      if (taskState != TaskStatus.State.FAILED && 
+              taskState != TaskStatus.State.KILLED) {
+        LOG.info("Task '" + taskid + "' running on '" + trackerName + 
+                "' in state: '" + taskState + "' being failed!");
+        status.setRunState(TaskStatus.State.FAILED);
+        taskState = TaskStatus.State.FAILED;
+      }
+
       // tasktracker went down and failed time was not reported. 
       if (0 == status.getFinishTime()){
         status.setFinishTime(System.currentTimeMillis());
       }
     }
+
     this.activeTasks.remove(taskid);
     if (this.completes > 0 && this.isMapTask()) {
       this.completes--;
     }
 
-    numTaskFailures++;
+    if (taskState == TaskStatus.State.FAILED) {
+      numTaskFailures++;
+    } else {
+      numKilledTasks++;
+    }
+
     if (numTaskFailures >= MAX_TASK_FAILURES) {
       LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
       kill();
@@ -553,14 +576,15 @@
       execStartTime = System.currentTimeMillis();
     }
 
-    // Create the 'taskid'
+    // Create the 'taskid'; do not count the 'killed' tasks against the job!
     String taskid = null;
-    if (nextTaskId < (MAX_TASK_EXECS + MAX_TASK_FAILURES)) {
+    if (nextTaskId < (MAX_TASK_EXECS + MAX_TASK_FAILURES + numKilledTasks)) {
       taskid = new String("task_" + taskIdPrefix + "_" + nextTaskId);
       ++nextTaskId;
     } else {
-      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + MAX_TASK_FAILURES) + 
-               " attempts for the tip '" + getTIPId() + "'");
+      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + MAX_TASK_FAILURES) +
+              " (plus " + numKilledTasks + " killed)"  + 
+              " attempts for the tip '" + getTIPId() + "'");
       return null;
     }
         

Modified: lucene/hadoop/trunk/src/webapps/job/jobdetails.jsp
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/webapps/job/jobdetails.jsp?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/src/webapps/job/jobdetails.jsp (original)
+++ lucene/hadoop/trunk/src/webapps/job/jobdetails.jsp Tue Apr 24 01:35:33 2007
@@ -26,7 +26,8 @@
     int runningTasks = 0;
     int finishedTasks = 0;
     int killedTasks = 0;
-    int failures = 0;
+    int failedTaskAttempts = 0;
+    int killedTaskAttempts = 0;
     for(int i=0; i < totalTasks; ++i) {
       TaskInProgress task = tasks[i];
       if (task.isComplete()) {
@@ -36,7 +37,8 @@
       } else if (task.wasKilled()) {
         killedTasks += 1;
       }
-      failures += task.numTaskFailures();
+      failedTaskAttempts += task.numTaskFailures();
+      killedTaskAttempts += task.numKilledTasks();
     }
     out.print("<tr><th><a href=\"/jobtasks.jsp?jobid=" + jobId + 
               "&type="+ kind + "&pagenum=1\">" + kind + 
@@ -52,9 +54,21 @@
               finishedTasks + 
               "</td><td align=\"right\">" +
               killedTasks +
-              "</td><td align=\"right\"><a href=\"/jobfailures.jsp?jobid=" + jobId +
-              "&kind=" + kind + "\">" +
-              failures + "</a></td></tr>\n");
+              "</td><td align=\"right\">" + 
+              ((failedTaskAttempts > 0) ? 
+                  new String("<a href=\"/jobfailures.jsp?jobid=" + jobId + 
+                      "&kind=" + kind + "&cause=failed\">" + failedTaskAttempts + 
+                      "</a>") : 
+                  "0"
+                  ) + 
+              " / " +
+              ((killedTaskAttempts > 0) ? 
+                  new String("<a href=\"/jobfailures.jsp?jobid=" + jobId + 
+                      "&kind=" + kind + "&cause=killed\">" + killedTaskAttempts + 
+                      "</a>") : 
+                  "0"
+                  ) + 
+              "</td></tr>\n");
   }
 %>       
 <%   
@@ -126,7 +140,7 @@
               "<th>Pending</th><th>Running</th><th>Complete</th>" +
               "<th>Killed</th>" +
               "<th><a href=\"/jobfailures.jsp?jobid=" + jobId + 
-              "\">Failures</a></th></tr>\n");
+              "\">Failed/Killed<br>Task Attempts</a></th></tr>\n");
     printTaskSummary(out, jobId, "map", status.mapProgress(), 
                      job.getMapTasks());
     printTaskSummary(out, jobId, "reduce", status.reduceProgress(),

Modified: lucene/hadoop/trunk/src/webapps/job/jobfailures.jsp
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/webapps/job/jobfailures.jsp?view=diff&rev=531832&r1=531831&r2=531832
==============================================================================
--- lucene/hadoop/trunk/src/webapps/job/jobfailures.jsp (original)
+++ lucene/hadoop/trunk/src/webapps/job/jobfailures.jsp Tue Apr 24 01:35:33 2007
@@ -15,11 +15,14 @@
   
   private void printFailedAttempts(JspWriter out,
                                    String jobId,
-                                   TaskInProgress tip) throws IOException {
+                                   TaskInProgress tip,
+                                   TaskStatus.State failState) throws IOException {
     TaskStatus[] statuses = tip.getTaskStatuses();
     String tipId = tip.getTIPId();
     for(int i=0; i < statuses.length; ++i) {
-      if (statuses[i].getRunState() == TaskStatus.State.FAILED) {
+      TaskStatus.State taskState = statuses[i].getRunState();
+      if ((failState == null && (taskState == TaskStatus.State.FAILED || 
+          taskState == TaskStatus.State.KILLED)) || taskState == failState) {
         String taskTrackerName = statuses[i].getTaskTracker();
         TaskTrackerStatus taskTracker = tracker.getTaskTracker(taskTrackerName);
         out.print("<tr><td>" + statuses[i].getTaskId() +
@@ -33,6 +36,7 @@
                     taskTracker.getHttpPort() + "\">" +  taskTracker.getHost() + 
                     "</a></td>");
         }
+        out.print("<td>" + taskState + "</td>");
         out.print("<td><pre>");
         List<String> failures = 
                      tracker.getTaskDiagnostics(jobId, tipId, 
@@ -72,12 +76,14 @@
              
   private void printFailures(JspWriter out, 
                              String jobId,
-                             String kind) throws IOException {
+                             String kind, 
+                             String cause) throws IOException {
     JobInProgress job = (JobInProgress) tracker.getJob(jobId);
     if (job == null) {
       out.print("<b>Job " + jobId + " not found.</b><br>\n");
       return;
     }
+    
     boolean includeMap = false;
     boolean includeReduce = false;
     if (kind == null) {
@@ -94,19 +100,35 @@
       out.print("<b>Kind " + kind + " not supported.</b><br>\n");
       return;
     }
+    
+    TaskStatus.State state = null;
+    try {
+      if (cause != null) {
+        state = TaskStatus.State.valueOf(cause.toUpperCase());
+        if (state != TaskStatus.State.FAILED && state != TaskStatus.State.KILLED) {
+          out.print("<b>Cause '" + cause + 
+              "' is not an 'unsuccessful' state.</b><br>\n");
+          return;
+        }
+      }
+    } catch (IllegalArgumentException e) {
+      out.print("<b>Cause " + cause + " not supported.</b><br>\n");
+      return;
+    }
+    	
     out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
-    out.print("<tr><th>Attempt</th><th>Task</th><th>Machine</th>" +
+    out.print("<tr><th>Attempt</th><th>Task</th><th>Machine</th><th>State</th>" +
               "<th>Error</th><th>Logs</th></tr>\n");
     if (includeMap) {
       TaskInProgress[] tips = job.getMapTasks();
       for(int i=0; i < tips.length; ++i) {
-        printFailedAttempts(out, jobId, tips[i]);
+        printFailedAttempts(out, jobId, tips[i], state);
       }
     }
     if (includeReduce) {
       TaskInProgress[] tips = job.getReduceTasks();
       for(int i=0; i < tips.length; ++i) {
-        printFailedAttempts(out, jobId, tips[i]);
+        printFailedAttempts(out, jobId, tips[i], state);
       }
     }
     out.print("</table>\n");
@@ -116,6 +138,7 @@
 <%
     String jobId = request.getParameter("jobid");
     String kind = request.getParameter("kind");
+    String cause = request.getParameter("cause");
 %>
 
 <html>
@@ -125,7 +148,7 @@
 failures on <a href="/jobtracker.jsp"><%=trackerName%></a></h1>
 
 <% 
-    printFailures(out, jobId, kind); 
+    printFailures(out, jobId, kind, cause); 
 %>
 
 <hr>



Mime
View raw message