hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject svn commit: r395284 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/dfs/FSNamesystem.java src/java/org/apache/hadoop/mapred/TaskTracker.java src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java src/webapps/mapred/jobtracker.jsp
Date Wed, 19 Apr 2006 16:08:23 GMT
Author: cutting
Date: Wed Apr 19 09:08:20 2006
New Revision: 395284

URL: http://svn.apache.org/viewcvs?rev=395284&view=rev
Log:
Fix HADOOP-148.  Maintain a task failure count per tasktracker and display it in the web ui.
 Contributed by Owen.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java
    lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/CHANGES.txt?rev=395284&r1=395283&r2=395284&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Apr 19 09:08:20 2006
@@ -61,6 +61,9 @@
 17. Fix HADOOP-142.  Avoid re-running a task on a host where it has
     previously failed.  (omalley via cutting)
 
+18. Fix HADOOP-148.  Maintain a task failure count for each
+    tasktracker and display it in the web ui.  (omalley via cutting)
+
 
 Release 0.1.1 - 2006-04-08
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?rev=395284&r1=395283&r2=395284&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Wed Apr 19 09:08:20 2006
@@ -243,7 +243,9 @@
                                             short replication 
                                           ) throws IOException {
         if (pendingCreates.get(src) != null) {
-          LOG.warning("Cannot start file because pendingCreates is non-null. src=" + src);
+          LOG.warning("Cannot create file " + src + " for " + holder +
+                       " on " + clientMachine + 
+                       " because pendingCreates is non-null.");
           return null;
         }
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?rev=395284&r1=395283&r2=395284&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Wed Apr 19 09:08:20 2006
@@ -69,6 +69,7 @@
     private MapOutputFile mapOutputFile;
 
     private int maxCurrentTasks;
+    private int failures;
 
     class MapOutputServer extends RPC.Server {
       private MapOutputServer(int port, int threads) {
@@ -255,7 +256,10 @@
                 this.fs = FileSystem.getNamed(jobClient.getFilesystemName(), this.fConf);
             }
             
-            int resultCode = jobClient.emitHeartbeat(new TaskTrackerStatus(taskTrackerName, localHostname, mapOutputPort, taskReports), justStarted);
+            TaskTrackerStatus status = 
+              new TaskTrackerStatus(taskTrackerName, localHostname, 
+                                    mapOutputPort, taskReports, failures); 
+            int resultCode = jobClient.emitHeartbeat(status, justStarted);
             justStarted = false;
               
             if (resultCode == InterTrackerProtocol.UNKNOWN_TASKTRACKER) {
@@ -279,10 +283,11 @@
                 for (Iterator it = runningTasks.values().iterator(); it.hasNext(); ) {
                     TaskInProgress tip = (TaskInProgress) it.next();
                     if ((tip.getRunState() == TaskStatus.RUNNING) &&
-                        (System.currentTimeMillis() - tip.getLastProgressReport() > this.taskTimeout)) {
+                        (System.currentTimeMillis() - tip.getLastProgressReport() > this.taskTimeout) &&
+                        !tip.wasKilled) {
                         LOG.info("Task " + tip.getTask().getTaskId() + " timed out.  Killing.");
                         tip.reportDiagnosticInfo("Timed out.");
-                        tip.killAndCleanup();
+                        tip.killAndCleanup(true);
                     }
                 }
             }
@@ -531,6 +536,9 @@
             if (done) {
                 runstate = TaskStatus.SUCCEEDED;
             } else {
+                if (!wasKilled) {
+                  failures += 1;
+                }
                 runstate = TaskStatus.FAILED;
             }
 
@@ -554,7 +562,7 @@
          */
         public synchronized void jobHasFinished() throws IOException {
             if (getRunState() == TaskStatus.RUNNING) {
-                killAndCleanup();
+                killAndCleanup(false);
             } else {
                 cleanup();
             }
@@ -563,9 +571,13 @@
         /**
          * This task has run on too long, and should be killed.
          */
-        public synchronized void killAndCleanup() throws IOException {
+        public synchronized void killAndCleanup(boolean wasFailure
+                                                ) throws IOException {
             if (runstate == TaskStatus.RUNNING) {
                 wasKilled = true;
+                if (wasFailure) {
+                  failures += 1;
+                }
                 runner.kill();
             }
         }

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java?rev=395284&r1=395283&r2=395284&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java Wed Apr 19 09:08:20 2006
@@ -41,6 +41,7 @@
     String trackerName;
     String host;
     int port;
+    int failures;
     Vector taskReports;
     
     volatile long lastSeen;
@@ -52,13 +53,15 @@
 
     /**
      */
-    public TaskTrackerStatus(String trackerName, String host, int port, Vector taskReports) {
+    public TaskTrackerStatus(String trackerName, String host, int port, 
+                             Vector taskReports, int failures) {
         this.trackerName = trackerName;
         this.host = host;
         this.port = port;
 
         this.taskReports = new Vector();
         this.taskReports.addAll(taskReports);
+        this.failures = failures;
     }
 
     /**
@@ -78,6 +81,14 @@
     }
 
     /**
+     * Get the number of tasks that have failed on this tracker.
+     * @return The number of failed tasks
+     */
+    public int getFailures() {
+      return failures;
+    }
+    
+    /**
      * All current tasks at the TaskTracker.  
      *
      * Tasks are tracked by a TaskStatus object.
@@ -127,6 +138,7 @@
         out.writeInt(port);
 
         out.writeInt(taskReports.size());
+        out.writeInt(failures);
         for (Iterator it = taskReports.iterator(); it.hasNext(); ) {
             ((TaskStatus) it.next()).write(out);
         }
@@ -143,6 +155,7 @@
         taskReports.clear();
 
         int numTasks = in.readInt();
+        this.failures = in.readInt();
         for (int i = 0; i < numTasks; i++) {
             TaskStatus tmp = new TaskStatus();
             tmp.readFields(in);

Modified: lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp?rev=395284&r1=395283&r2=395284&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp (original)
+++ lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp Wed Apr 19 09:08:20 2006
@@ -20,9 +20,12 @@
     } else {
       out.print("<center>\n");
       out.print("<table border=\"2\" cellpadding=\"5\" cellspacing=\"2\">\n");
-      out.print("<tr><td align=\"center\" colspan=\"4\"><b>Task Trackers</b></td></tr>\n");
-      out.print("<tr><td><b>Name</b></td><td><b>Host</b></td><td><b># running tasks</b></td><td><b>Secs since heartbeat</b></td></tr>\n");
-
+      out.print("<tr><td align=\"center\" colspan=\"5\"><b>Task Trackers</b></td></tr>\n");
+      out.print("<tr><td><b>Name</b></td><td><b>Host</b></td>" +
+                "<td><b># running tasks</b></td><td><b>Failures</b></td>" +
+                "<td><b>Secs since heartbeat</b></td></tr>\n");
+      int maxFailures = 0;
+      String failureKing = null;
       for (Iterator it = c.iterator(); it.hasNext(); ) {
         TaskTrackerStatus tt = (TaskTrackerStatus) it.next();
         long sinceHeartbeat = System.currentTimeMillis() - tt.getLastSeen();
@@ -34,11 +37,23 @@
           it2.next();
           numCurTasks++;
         }
+        int numFailures = tt.getFailures();
+        if (numFailures > maxFailures) {
+          maxFailures = numFailures;
+          failureKing = tt.getTrackerName();
+        }
 
-        out.print("<tr><td>" + tt.getTrackerName() + "</td><td>" + tt.getHost() + "</td><td>" + numCurTasks + "</td><td>" + sinceHeartbeat + "</td></tr>\n");
+        out.print("<tr><td>" + tt.getTrackerName() + "</td><td>" +
+                  tt.getHost() + "</td><td>" + numCurTasks +
+                  "</td><td>" + numFailures + 
+                  "</td><td>" + sinceHeartbeat + "</td></tr>\n");
       }
       out.print("</table>\n");
       out.print("</center>\n");
+      if (maxFailures > 0) {
+        out.print("Highest Failures: " + failureKing + " with " + maxFailures + 
+                  " failures<br>\n");
+      }
     }
   }
 



Mime
View raw message