Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 26707 invoked from network); 7 May 2007 21:33:26 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 7 May 2007 21:33:26 -0000 Received: (qmail 28741 invoked by uid 500); 7 May 2007 21:33:32 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 28724 invoked by uid 500); 7 May 2007 21:33:32 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 28711 invoked by uid 99); 7 May 2007 21:33:32 -0000 Received: from herse.apache.org (HELO herse.apache.org) (140.211.11.133) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 07 May 2007 14:33:32 -0700 X-ASF-Spam-Status: No, hits=-98.6 required=10.0 tests=ALL_TRUSTED,INFO_TLD,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 07 May 2007 14:33:24 -0700 Received: by eris.apache.org (Postfix, from userid 65534) id A30F11A9838; Mon, 7 May 2007 14:33:04 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r536000 - in /lucene/hadoop/trunk: ./ src/java/org/apache/hadoop/mapred/ Date: Mon, 07 May 2007 21:33:04 -0000 To: hadoop-commits@lucene.apache.org From: cutting@apache.org X-Mailer: svnmailer-1.1.0 Message-Id: <20070507213304.A30F11A9838@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: cutting Date: Mon May 7 14:33:03 2007 New Revision: 536000 URL: http://svn.apache.org/viewvc?view=rev&rev=536000 Log: HADOOP-1324. Change so that an FSError kills only the task that generates it rather than the entire task tracker. Contributed by Arun. Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/IsolationRunner.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Mon May 7 14:33:03 2007 @@ -356,6 +356,10 @@ More care is also taken to not allocate files on full or offline drives. (Devaraj Das via cutting) +106. HADOOP-1324. Change so that an FSError kills only the task that + generates it rather than the entire task tracker. + (Arun C Murthy via cutting) + Release 0.12.3 - 2007-04-06 Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/IsolationRunner.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/IsolationRunner.java?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/IsolationRunner.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/IsolationRunner.java Mon May 7 14:33:03 2007 @@ -47,8 +47,8 @@ LOG.info("Task " + taskid + " reporting done."); } - public void fsError(String message) throws IOException { - LOG.info("Task reporting file system error: " + message); + public void fsError(String taskId, String message) throws IOException { + LOG.info("Task " + taskId + " reporting file system error: " + message); } public Task getTask(String taskid) throws IOException { Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/LocalJobRunner.java Mon May 7 14:33:03 2007 @@ -236,8 +236,9 @@ } } - public synchronized void fsError(String message) throws IOException { - LOG.fatal("FSError: "+ message); + public synchronized void fsError(String taskId, String message) + throws IOException { + LOG.fatal("FSError: "+ message + "from task: " + taskId); } public TaskCompletionEvent[] getMapCompletionEvents( Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java Mon May 7 14:33:03 2007 @@ -289,7 +289,7 @@ } catch (FSError e) { LOG.fatal("FSError", e); try { - tracker.fsError(e.getMessage()); + tracker.fsError(t.getTaskId(), e.getMessage()); } catch (IOException ie) { LOG.fatal(t.getTaskId()+" reporting FSError", ie); } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Mon May 7 14:33:03 2007 @@ -1577,11 +1577,15 @@ } } - /** A child task had a local filesystem error. Exit, so that no future - * jobs are accepted. */ - public synchronized void fsError(String message) throws IOException { - LOG.fatal("FSError, exiting: "+ message); - running = false; + /** + * A child task had a local filesystem error. Kill the task. + */ + public synchronized void fsError(String taskId, String message) + throws IOException { + LOG.fatal("Task: " + taskId + " - Killed due to FSError: " + message); + TaskInProgress tip = runningTasks.get(taskId); + tip.reportDiagnosticInfo("FSError: " + message); + purgeTask(tip); } public TaskCompletionEvent[] getMapCompletionEvents( @@ -1705,7 +1709,7 @@ task.run(job, umbilical); // run the task } catch (FSError e) { LOG.fatal("FSError from child", e); - umbilical.fsError(e.getMessage()); + umbilical.fsError(taskid, e.getMessage()); } catch (Throwable throwable) { LOG.warn("Error running child", throwable); // Report back any failures, for diagnostic purposes Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java?view=diff&rev=536000&r1=535999&r2=536000 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java Mon May 7 14:33:03 2007 @@ -63,7 +63,7 @@ void done(String taskid) throws IOException; /** Report that the task encounted a local filesystem error.*/ - void fsError(String message) throws IOException; + void fsError(String taskId, String message) throws IOException; /** Called by a reduce task to get the map output locations for finished maps. *