Subject: svn commit: r1408411 - in /hadoop/common/branches/branch-0.23/hadoop-mapreduce-project: ./ hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/ hadoop-mapreduce-client/hadoop-mapreduce-client-ap...
Date: Mon, 12 Nov 2012 19:02:34 -0000
To: mapreduce-commits@hadoop.apache.org
From: bobby@apache.org
Message-Id: <20121112190236.2415223888E4@eris.apache.org>

Author: bobby
Date: Mon Nov 12 19:02:32 2012
New Revision: 1408411

URL: http://svn.apache.org/viewvc?rev=1408411&view=rev
Log:
MAPREDUCE-4425. Speculation + Fetch failures can lead to a hung job (jlowe via bobby)

Modified:
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskImpl.java
    hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java

Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt?rev=1408411&r1=1408410&r2=1408411&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt Mon Nov 12 19:02:32 2012
@@ -90,6 +90,9 @@ Release 0.23.5 - UNRELEASED

     MAPREDUCE-4751. AM stuck in KILL_WAIT for days (vinodkv via bobby)

     MAPREDUCE-4787. TestJobMonitorAndPrint is broken (Rob Parker via bobby)
+
+    MAPREDUCE-4425. Speculation + Fetch failures can lead to a hung job (jlowe
+    via bobby)

 Release 0.23.4 - UNRELEASED
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskImpl.java?rev=1408411&r1=1408410&r2=1408411&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskImpl.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskImpl.java Mon Nov 12 19:02:32 2012
@@ -212,15 +212,17 @@ public abstract class TaskImpl implement
     .addTransition(TaskStateInternal.SUCCEEDED, //only possible for map tasks
         EnumSet.of(TaskStateInternal.SCHEDULED, TaskStateInternal.SUCCEEDED,
             TaskStateInternal.FAILED),
         TaskEventType.T_ATTEMPT_FAILED, new MapRetroactiveFailureTransition())
+    .addTransition(TaskStateInternal.SUCCEEDED, TaskStateInternal.SUCCEEDED,
+        EnumSet.of(TaskEventType.T_ATTEMPT_KILLED,
+            TaskEventType.T_ATTEMPT_SUCCEEDED),
+        new AttemptCompletedAtSucceededTransition())
     // Ignore-able transitions.
     .addTransition(
         TaskStateInternal.SUCCEEDED, TaskStateInternal.SUCCEEDED,
         EnumSet.of(TaskEventType.T_KILL,
             TaskEventType.T_ADD_SPEC_ATTEMPT,
             TaskEventType.T_ATTEMPT_COMMIT_PENDING,
-            TaskEventType.T_ATTEMPT_LAUNCHED,
-            TaskEventType.T_ATTEMPT_KILLED,
-            TaskEventType.T_ATTEMPT_SUCCEEDED))
+            TaskEventType.T_ATTEMPT_LAUNCHED))

     // Transitions from FAILED state
     .addTransition(TaskStateInternal.FAILED, TaskStateInternal.FAILED,
@@ -964,6 +966,8 @@ public abstract class TaskImpl implement
           !castEvent.getTaskAttemptID().equals(task.successfulAttempt)) {
         // don't allow a different task attempt to override a previous
         // succeeded state
+        task.finishedAttempts.add(castEvent.getTaskAttemptID());
+        task.inProgressAttempts.remove(castEvent.getTaskAttemptID());
         return TaskStateInternal.SUCCEEDED;
       }

@@ -993,6 +997,16 @@ public abstract class TaskImpl implement
     }
   }

+  private static class AttemptCompletedAtSucceededTransition
+      implements SingleArcTransition<TaskImpl, TaskEvent> {
+    @Override
+    public void transition(TaskImpl task, TaskEvent event) {
+      TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
+      task.finishedAttempts.add(castEvent.getTaskAttemptID());
+      task.inProgressAttempts.remove(castEvent.getTaskAttemptID());
+    }
+  }
+
   private static class KillNewTransition
     implements SingleArcTransition<TaskImpl, TaskEvent> {
     @Override
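Background for the change above, for readers who do not work in the MRAppMaster state machines day to day: once a map task has reached SUCCEEDED, a late T_ATTEMPT_KILLED or T_ATTEMPT_SUCCEEDED from a speculative attempt is no longer dropped outright; the attempt is still moved from the in-progress set to the finished set, so a later retroactive fetch failure sees consistent bookkeeping and can reschedule the map instead of leaving the job hung. The following is a minimal stand-alone sketch of that idea, not the actual org.apache.hadoop.mapreduce.v2 classes; every class, field, and method name below is illustrative only.

import java.util.HashSet;
import java.util.Set;

// Simplified stand-in for the TaskImpl state machine; illustrative only.
class TaskModel {
  enum State { SCHEDULED, SUCCEEDED, FAILED }
  enum EventType { T_ATTEMPT_KILLED, T_ATTEMPT_SUCCEEDED, T_ATTEMPT_FAILED }

  State state = State.SUCCEEDED;                 // the task has already succeeded
  final Set<Integer> inProgressAttempts = new HashSet<>();
  final Set<Integer> finishedAttempts = new HashSet<>();

  // Handles a late attempt-completion event arriving while the task is SUCCEEDED.
  // Before this commit the KILLED/SUCCEEDED cases were ignored entirely, which
  // left the attempt sets inconsistent and could hang the job.
  void handleInSucceededState(EventType type, int attemptId) {
    switch (type) {
      case T_ATTEMPT_KILLED:
      case T_ATTEMPT_SUCCEEDED:
        // analogue of AttemptCompletedAtSucceededTransition: record the attempt
        finishedAttempts.add(attemptId);
        inProgressAttempts.remove(attemptId);
        break;                                   // state stays SUCCEEDED
      case T_ATTEMPT_FAILED:
        // rough analogue of the retroactive fetch-failure path: account for the
        // failed attempt and send the map task back to be scheduled again
        finishedAttempts.add(attemptId);
        inProgressAttempts.remove(attemptId);
        state = State.SCHEDULED;
        break;
    }
  }
}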
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java?rev=1408411&r1=1408410&r2=1408411&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java Mon Nov 12 19:02:32 2012
@@ -140,7 +140,6 @@ public class TestTaskImpl {

     private float progress = 0;
     private TaskAttemptState state = TaskAttemptState.NEW;
-    private TaskAttemptId attemptId;

     public MockTaskAttemptImpl(TaskId taskId, int id, EventHandler eventHandler,
         TaskAttemptListener taskAttemptListener, Path jobFile, int partition,
@@ -150,13 +149,10 @@ public class TestTaskImpl {
         AppContext appContext) {
       super(taskId, id, eventHandler, taskAttemptListener, jobFile, partition,
           conf, dataLocations, committer, jobToken, credentials, clock,
           appContext);
-      attemptId = Records.newRecord(TaskAttemptId.class);
-      attemptId.setId(id);
-      attemptId.setTaskId(taskId);
     }

     public TaskAttemptId getAttemptId() {
-      return attemptId;
+      return getID();
     }

     @Override
@@ -522,4 +518,46 @@ public class TestTaskImpl {
   public void testCommitAfterSucceeds() {
     runSpeculativeTaskAttemptSucceeds(TaskEventType.T_ATTEMPT_COMMIT_PENDING);
   }
+
+  @Test
+  public void testSpeculativeMapFetchFailure() {
+    // Setup a scenario where speculative task wins, first attempt killed
+    runSpeculativeTaskAttemptSucceeds(TaskEventType.T_ATTEMPT_KILLED);
+    assertEquals(2, taskAttempts.size());
+
+    // speculative attempt retroactively fails from fetch failures
+    mockTask.handle(new TaskTAttemptEvent(taskAttempts.get(1).getAttemptId(),
+        TaskEventType.T_ATTEMPT_FAILED));
+
+    assertTaskScheduledState();
+    assertEquals(3, taskAttempts.size());
+  }
+
+  @Test
+  public void testSpeculativeMapMultipleSucceedFetchFailure() {
+    // Setup a scenario where speculative task wins, first attempt succeeds
+    runSpeculativeTaskAttemptSucceeds(TaskEventType.T_ATTEMPT_SUCCEEDED);
+    assertEquals(2, taskAttempts.size());
+
+    // speculative attempt retroactively fails from fetch failures
+    mockTask.handle(new TaskTAttemptEvent(taskAttempts.get(1).getAttemptId(),
+        TaskEventType.T_ATTEMPT_FAILED));
+
+    assertTaskScheduledState();
+    assertEquals(3, taskAttempts.size());
+  }
+
+  @Test
+  public void testSpeculativeMapFailedFetchFailure() {
+    // Setup a scenario where speculative task wins, first attempt fails
+    runSpeculativeTaskAttemptSucceeds(TaskEventType.T_ATTEMPT_FAILED);
+    assertEquals(2, taskAttempts.size());
+
+    // speculative attempt retroactively fails from fetch failures
+    mockTask.handle(new TaskTAttemptEvent(taskAttempts.get(1).getAttemptId(),
+        TaskEventType.T_ATTEMPT_FAILED));
+
+    assertTaskScheduledState();
+    assertEquals(3, taskAttempts.size());
+  }
 }
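As a rough usage illustration of the scenario the new tests exercise, the snippet below drives the simplified TaskModel from the sketch above through the same sequence: a speculative attempt succeeds, the original attempt is killed late, and a subsequent fetch failure against the successful attempt sends the task back to SCHEDULED instead of leaving it stuck. This is again an illustration against the toy model, not the real TestTaskImpl harness; the attempt ids and the demo class name are made up for the example.

// Drives the simplified TaskModel above through the speculation + fetch-failure
// scenario; the printed results mirror what the new TestTaskImpl cases assert.
public class TaskModelDemo {
  public static void main(String[] args) {
    TaskModel task = new TaskModel();       // already SUCCEEDED via speculative attempt 1
    task.inProgressAttempts.add(0);         // original attempt 0 still running

    // the original attempt is killed after the task has already succeeded
    task.handleInSucceededState(TaskModel.EventType.T_ATTEMPT_KILLED, 0);

    // reducers report fetch failures against the successful attempt 1, so its
    // map output is considered lost and the task must be scheduled again
    task.handleInSucceededState(TaskModel.EventType.T_ATTEMPT_FAILED, 1);

    System.out.println(task.state);               // SCHEDULED
    System.out.println(task.finishedAttempts);    // [0, 1]
    System.out.println(task.inProgressAttempts);  // []
  }
}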