Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 48386 invoked from network); 26 Sep 2007 18:13:38 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 26 Sep 2007 18:13:38 -0000 Received: (qmail 65919 invoked by uid 500); 26 Sep 2007 18:13:28 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 65899 invoked by uid 500); 26 Sep 2007 18:13:27 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 65668 invoked by uid 99); 26 Sep 2007 18:13:27 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 26 Sep 2007 11:13:27 -0700 X-ASF-Spam-Status: No, hits=-100.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 26 Sep 2007 18:13:35 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 92A741A9832; Wed, 26 Sep 2007 11:13:15 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r579744 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/mapred/JobInProgress.java src/java/org/apache/hadoop/mapred/JobTracker.java Date: Wed, 26 Sep 2007 18:13:15 -0000 To: hadoop-commits@lucene.apache.org From: omalley@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20070926181315.92A741A9832@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: omalley Date: Wed Sep 26 11:13:14 2007 New Revision: 579744 URL: http://svn.apache.org/viewvc?rev=579744&view=rev Log: HADOOP-1930. Assign the blame for shuffle failures to the right task tracker. 
Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=579744&r1=579743&r2=579744&view=diff ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Wed Sep 26 11:13:14 2007 @@ -191,6 +191,9 @@ HADOOP-1940. TestDFSUpgradeFromImage must shut down its MiniDFSCluster. (Chris Douglas via nigel) + HADOOP-1930. Place the blame for failed fetches on the right host. (Arun C. + Murthy via omalley) + IMPROVEMENTS HADOOP-1908. Restructure data node code so that block sending and Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java?rev=579744&r1=579743&r2=579744&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java Wed Sep 26 11:13:14 2007 @@ -407,7 +407,7 @@ TaskStatus.Phase.MAP : TaskStatus.Phase.REDUCE), TaskStatus.State.FAILED, - ttStatus.getHost(), status.getTaskTracker(), null); + status.getTaskTracker(), null); LOG.info("Failed to copy the output of " + status.getTaskId() + " with: " + StringUtils.stringifyException(ioe)); return; @@ -1046,8 +1046,7 @@ */ public void failedTask(TaskInProgress tip, String taskid, String reason, TaskStatus.Phase phase, TaskStatus.State state, - String hostname, String trackerName, - JobTrackerMetrics metrics) { + String trackerName, JobTrackerMetrics metrics) { TaskStatus status = TaskStatus.createTaskStatus(tip.isMapTask(), taskid, 0.0f, @@ -1148,7 +1147,7 @@ 
synchronized void fetchFailureNotification(TaskInProgress tip, String mapTaskId, - String hostname, String trackerName, + String trackerName, JobTrackerMetrics metrics) { Integer fetchFailures = mapTaskIdToFetchFailuresMap.get(mapTaskId); fetchFailures = (fetchFailures == null) ? 1 : (fetchFailures+1); @@ -1163,7 +1162,7 @@ failedTask(tip, mapTaskId, "Too many fetch-failures", (tip.isMapTask() ? TaskStatus.Phase.MAP : TaskStatus.Phase.REDUCE), - TaskStatus.State.FAILED, hostname, trackerName, metrics); + TaskStatus.State.FAILED, trackerName, metrics); mapTaskIdToFetchFailuresMap.remove(mapTaskId); } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?rev=579744&r1=579743&r2=579744&view=diff ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Wed Sep 26 11:13:14 2007 @@ -201,8 +201,7 @@ tip.isMapTask()? 
TaskStatus.Phase.MAP: TaskStatus.Phase.STARTING, TaskStatus.State.FAILED, - trackerStatus.getHost(), trackerName, - myMetrics); + trackerName, myMetrics); } itr.remove(); } else { @@ -294,8 +293,7 @@ if (now - newProfile.getLastSeen() > TASKTRACKER_EXPIRY_INTERVAL) { // Remove completely updateTaskTrackerStatus(trackerName, null); - lostTaskTracker(leastRecent.getTrackerName(), - leastRecent.getHost()); + lostTaskTracker(leastRecent.getTrackerName()); } else { // Update time by inserting latest profile trackerExpiryQueue.add(newProfile); @@ -1242,7 +1240,7 @@ // If it's first contact, then clear out // any state hanging around if (seenBefore) { - lostTaskTracker(trackerName, trackerStatus.getHost()); + lostTaskTracker(trackerName); } } else { // If not first contact, there should be some record of the tracker @@ -1771,11 +1769,16 @@ if (failedFetchMaps != null) { for (String mapTaskId : failedFetchMaps) { TaskInProgress failedFetchMap = taskidToTIPMap.get(mapTaskId); + if (failedFetchMap != null) { + // Gather information about the map which has to be failed, if need be + String failedFetchTrackerName = getAssignedTracker(mapTaskId); + if (failedFetchTrackerName == null) { + failedFetchTrackerName = "Lost task tracker"; + } failedFetchMap.getJob().fetchFailureNotification(failedFetchMap, mapTaskId, - status.getHost(), - trackerName, + failedFetchTrackerName, myMetrics); } } @@ -1788,7 +1791,7 @@ * already been updated. Just process the contained tasks and any * jobs that might be affected. 
*/ - void lostTaskTracker(String trackerName, String hostname) { + void lostTaskTracker(String trackerName) { LOG.info("Lost tracker '" + trackerName + "'"); Set lostTasks = trackerToTaskMap.get(trackerName); trackerToTaskMap.remove(trackerName); @@ -1805,12 +1808,11 @@ JobInProgress job = tip.getJob(); // if the job is done, we don't want to change anything if (job.getStatus().getRunState() == JobStatus.RUNNING) { - job.failedTask(tip, taskId, "Lost task tracker", + job.failedTask(tip, taskId, ("Lost task tracker: " + trackerName), (tip.isMapTask() ? TaskStatus.Phase.MAP : TaskStatus.Phase.REDUCE), - TaskStatus.State.KILLED, - hostname, trackerName, myMetrics); + TaskStatus.State.KILLED, trackerName, myMetrics); jobsWithFailures.add(job); } } else if (!tip.isMapTask() && tip.isComplete()) {