Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 10947 invoked from network); 19 Apr 2006 16:09:01 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur.apache.org with SMTP; 19 Apr 2006 16:09:01 -0000 Received: (qmail 56742 invoked by uid 500); 19 Apr 2006 16:09:01 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 56715 invoked by uid 500); 19 Apr 2006 16:09:01 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 56689 invoked by uid 99); 19 Apr 2006 16:09:00 -0000 Received: from asf.osuosl.org (HELO asf.osuosl.org) (140.211.166.49) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 19 Apr 2006 09:09:00 -0700 X-ASF-Spam-Status: No, hits=-8.6 required=10.0 tests=ALL_TRUSTED,INFO_TLD,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [209.237.227.194] (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.29) with SMTP; Wed, 19 Apr 2006 09:08:59 -0700 Received: (qmail 10560 invoked by uid 65534); 19 Apr 2006 16:08:37 -0000 Message-ID: <20060419160836.10504.qmail@minotaur.apache.org> Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r395284 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/dfs/FSNamesystem.java src/java/org/apache/hadoop/mapred/TaskTracker.java src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java src/webapps/mapred/jobtracker.jsp Date: Wed, 19 Apr 2006 16:08:23 -0000 To: hadoop-commits@lucene.apache.org From: cutting@apache.org X-Mailer: svnmailer-1.0.8 X-Virus-Checked: Checked by ClamAV on apache.org X-Spam-Rating: minotaur.apache.org 1.6.2 0/1000/N Author: cutting Date: Wed Apr 19 09:08:20 2006 New Revision: 395284 URL: 
http://svn.apache.org/viewcvs?rev=395284&view=rev Log: Fix HADOOP-148. Maintain a task failure count per tasktracker and display it in the web ui. Contributed by Owen. Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/CHANGES.txt?rev=395284&r1=395283&r2=395284&view=diff ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Wed Apr 19 09:08:20 2006 @@ -61,6 +61,9 @@ 17. Fix HADOOP-142. Avoid re-running a task on a host where it has previously failed. (omalley via cutting) +18. Fix HADOOP-148. Maintain a task failure count for each + tasktracker and display it in the web ui. (omalley via cutting) + Release 0.1.1 - 2006-04-08 Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?rev=395284&r1=395283&r2=395284&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Wed Apr 19 09:08:20 2006 @@ -243,7 +243,9 @@ short replication ) throws IOException { if (pendingCreates.get(src) != null) { - LOG.warning("Cannot start file because pendingCreates is non-null. 
src=" + src); + LOG.warning("Cannot create file " + src + " for " + holder + + " on " + clientMachine + + " because pendingCreates is non-null."); return null; } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?rev=395284&r1=395283&r2=395284&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Wed Apr 19 09:08:20 2006 @@ -69,6 +69,7 @@ private MapOutputFile mapOutputFile; private int maxCurrentTasks; + private int failures; class MapOutputServer extends RPC.Server { private MapOutputServer(int port, int threads) { @@ -255,7 +256,10 @@ this.fs = FileSystem.getNamed(jobClient.getFilesystemName(), this.fConf); } - int resultCode = jobClient.emitHeartbeat(new TaskTrackerStatus(taskTrackerName, localHostname, mapOutputPort, taskReports), justStarted); + TaskTrackerStatus status = + new TaskTrackerStatus(taskTrackerName, localHostname, + mapOutputPort, taskReports, failures); + int resultCode = jobClient.emitHeartbeat(status, justStarted); justStarted = false; if (resultCode == InterTrackerProtocol.UNKNOWN_TASKTRACKER) { @@ -279,10 +283,11 @@ for (Iterator it = runningTasks.values().iterator(); it.hasNext(); ) { TaskInProgress tip = (TaskInProgress) it.next(); if ((tip.getRunState() == TaskStatus.RUNNING) && - (System.currentTimeMillis() - tip.getLastProgressReport() > this.taskTimeout)) { + (System.currentTimeMillis() - tip.getLastProgressReport() > this.taskTimeout) && + !tip.wasKilled) { LOG.info("Task " + tip.getTask().getTaskId() + " timed out. 
Killing."); tip.reportDiagnosticInfo("Timed out."); - tip.killAndCleanup(); + tip.killAndCleanup(true); } } } @@ -531,6 +536,9 @@ if (done) { runstate = TaskStatus.SUCCEEDED; } else { + if (!wasKilled) { + failures += 1; + } runstate = TaskStatus.FAILED; } @@ -554,7 +562,7 @@ */ public synchronized void jobHasFinished() throws IOException { if (getRunState() == TaskStatus.RUNNING) { - killAndCleanup(); + killAndCleanup(false); } else { cleanup(); } @@ -563,9 +571,13 @@ /** * This task has run on too long, and should be killed. */ - public synchronized void killAndCleanup() throws IOException { + public synchronized void killAndCleanup(boolean wasFailure + ) throws IOException { if (runstate == TaskStatus.RUNNING) { wasKilled = true; + if (wasFailure) { + failures += 1; + } runner.kill(); } } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java?rev=395284&r1=395283&r2=395284&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTrackerStatus.java Wed Apr 19 09:08:20 2006 @@ -41,6 +41,7 @@ String trackerName; String host; int port; + int failures; Vector taskReports; volatile long lastSeen; @@ -52,13 +53,15 @@ /** */ - public TaskTrackerStatus(String trackerName, String host, int port, Vector taskReports) { + public TaskTrackerStatus(String trackerName, String host, int port, + Vector taskReports, int failures) { this.trackerName = trackerName; this.host = host; this.port = port; this.taskReports = new Vector(); this.taskReports.addAll(taskReports); + this.failures = failures; } /** @@ -78,6 +81,14 @@ } /** + * Get the number of tasks that have failed on this tracker. 
+ * @return The number of failed tasks + */ + public int getFailures() { + return failures; + } + + /** * All current tasks at the TaskTracker. * * Tasks are tracked by a TaskStatus object. @@ -127,6 +138,7 @@ out.writeInt(port); out.writeInt(taskReports.size()); + out.writeInt(failures); for (Iterator it = taskReports.iterator(); it.hasNext(); ) { ((TaskStatus) it.next()).write(out); } @@ -143,6 +155,7 @@ taskReports.clear(); int numTasks = in.readInt(); + this.failures = in.readInt(); for (int i = 0; i < numTasks; i++) { TaskStatus tmp = new TaskStatus(); tmp.readFields(in); Modified: lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp?rev=395284&r1=395283&r2=395284&view=diff ============================================================================== --- lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp (original) +++ lucene/hadoop/trunk/src/webapps/mapred/jobtracker.jsp Wed Apr 19 09:08:20 2006 @@ -20,9 +20,12 @@ } else { out.print("
\n"); out.print("\n"); - out.print("\n"); - out.print("\n"); - + out.print("\n"); + out.print("" + + "" + + "\n"); + int maxFailures = 0; + String failureKing = null; for (Iterator it = c.iterator(); it.hasNext(); ) { TaskTrackerStatus tt = (TaskTrackerStatus) it.next(); long sinceHeartbeat = System.currentTimeMillis() - tt.getLastSeen(); @@ -34,11 +37,23 @@ it2.next(); numCurTasks++; } + int numFailures = tt.getFailures(); + if (numFailures > maxFailures) { + maxFailures = numFailures; + failureKing = tt.getTrackerName(); + } - out.print("\n"); + out.print("\n"); } out.print("
Task Trackers
Name | Host | # running tasks | Secs since heartbeat
Task Trackers
Name | Host | # running tasks | Failures | Secs since heartbeat
" + tt.getTrackerName() + "" + tt.getHost() + "" + numCurTasks + "" + sinceHeartbeat + "
" + tt.getTrackerName() + "" + + tt.getHost() + "" + numCurTasks + + "" + numFailures + + "" + sinceHeartbeat + "
\n"); out.print("
\n"); + if (maxFailures > 0) { + out.print("Highest Failures: " + failureKing + " with " + maxFailures + + " failures
\n"); + } } }