Subject: svn commit: r701398 - in /hadoop/core/trunk: CHANGES.txt src/mapred/org/apache/hadoop/mapred/ReduceTask.java
Date: Fri, 03 Oct 2008 15:07:34 -0000
From: acmurthy@apache.org
To: core-commits@hadoop.apache.org
Reply-To: core-dev@hadoop.apache.org
Message-Id: <20081003150735.20E2A238889E@eris.apache.org>

Author: acmurthy
Date: Fri Oct  3 08:07:34 2008
New Revision: 701398

URL: http://svn.apache.org/viewvc?rev=701398&view=rev
Log:
HADOOP-4246. Ensure we have the correct lower bound on the number of retries
for fetching map-outputs; also fixed the case where, for small jobs, the
reducer would not kill itself even when map-outputs could not be fetched from
too many unique maps. Contributed by Amareshwari Sri Ramadasu.

Modified:
    hadoop/core/trunk/CHANGES.txt
    hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/ReduceTask.java

Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=701398&r1=701397&r2=701398&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Fri Oct  3 08:07:34 2008
@@ -831,6 +831,11 @@
     HADOOP-4319. fuse-dfs dfs_read function returns as many bytes as it is
     told to read unless end-of-file is reached. (Pete Wyckoff via dhruba)
 
+    HADOOP-4246. Ensure we have the correct lower bound on the number of
+    retries for fetching map-outputs; also fixed the case where, for small
+    jobs, the reducer would not kill itself even when map-outputs could not
+    be fetched from too many unique maps. (Amareshwari Sri Ramadasu via acmurthy)
+
 Release 0.18.2 - Unreleased
 
   BUG FIXES
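As context for the ReduceTask.java changes below, here is a minimal,
self-contained Java sketch of the failure check this patch fixes. It is
illustrative only, not the Hadoop source; the class and parameter names are
invented. The point is the small-jobs bug: with a fixed threshold of 5, a job
with fewer than 5 maps could never trip the check, so the patch caps the
threshold at numMaps and also fires when every still-uncopied map has failed.

    /** Hypothetical sketch of the shuffle kill-check after HADOOP-4246. */
    public class ShuffleKillCheckSketch {

      /** Default cap on distinct maps whose outputs may fail to fetch. */
      private static final int DEFAULT_MAX_FAILED_UNIQUE_FETCHES = 5;

      /** The patch bounds the threshold by the number of maps in the job. */
      static int maxFailedUniqueFetches(int numMaps) {
        return Math.min(numMaps, DEFAULT_MAX_FAILED_UNIQUE_FETCHES);
      }

      /**
       * Kill the reducer when too many distinct maps failed to fetch (or all
       * remaining outputs failed), the reducer is unhealthy, and it is making
       * insufficient progress -- mirroring the predicate in the patch.
       */
      static boolean shouldKillReducer(int fetchFailedMaps, int numMaps,
                                       int copiedMapOutputs,
                                       boolean reducerHealthy,
                                       boolean reducerProgressedEnough,
                                       boolean reducerStalled) {
        boolean tooManyFailures =
            fetchFailedMaps >= maxFailedUniqueFetches(numMaps)
            || fetchFailedMaps == (numMaps - copiedMapOutputs);
        return tooManyFailures
            && !reducerHealthy
            && (!reducerProgressedEnough || reducerStalled);
      }

      public static void main(String[] args) {
        // A 3-map job where all 3 fetches failed now trips the check;
        // the old constant threshold of 5 never would. Prints true.
        System.out.println(shouldKillReducer(3, 3, 0, false, false, true));
      }
    }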
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/ReduceTask.java?rev=701398&r1=701397&r2=701398&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/ReduceTask.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/ReduceTask.java Fri Oct  3 08:07:34 2008
@@ -615,13 +615,18 @@
      * Maximum percent of shuffle execution time required to keep the reducer alive.
      */
     private static final float MAX_ALLOWED_STALL_TIME_PERCENT = 0.5f;
+
+    /**
+     * Minimum number of map fetch retries.
+     */
+    private static final int MIN_FETCH_RETRIES_PER_MAP = 2;
 
     /**
      * Maximum no. of unique maps from which we failed to fetch map-outputs
      * even after {@link #maxFetchRetriesPerMap} retries; after this the
      * reduce task is failed.
      */
-    private static final int MAX_FAILED_UNIQUE_FETCHES = 5;
+    private int maxFailedUniqueFetches = 5;
 
     /**
      * The maps from which we fail to fetch map-outputs
@@ -1553,8 +1558,10 @@
       // the order is 4,8,16,32,64,128. sum of which is 252 sec = 4.2 min
       // optimizing for the base 2
-      this.maxFetchRetriesPerMap = getClosestPowerOf2((this.maxBackoff * 1000
-                                                       / BACKOFF_INIT) + 1);
+      this.maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP,
+          getClosestPowerOf2((this.maxBackoff * 1000 / BACKOFF_INIT) + 1));
+      this.maxFailedUniqueFetches = Math.min(numMaps,
+                                             this.maxFailedUniqueFetches);
       this.maxInMemOutputs = conf.getInt("mapred.inmem.merge.threshold", 1000);
       this.maxInMemCopyPer =
         conf.getFloat("mapred.job.shuffle.merge.percent", 0.66f);
@@ -1909,7 +1916,8 @@
                                  >= MAX_ALLOWED_STALL_TIME_PERCENT);
 
         // kill if not healthy and has insufficient progress
-        if ((fetchFailedMaps.size() >= MAX_FAILED_UNIQUE_FETCHES)
+        if ((fetchFailedMaps.size() >= maxFailedUniqueFetches ||
+             fetchFailedMaps.size() == (numMaps - copiedMapOutputs.size()))
             && !reducerHealthy
             && (!reducerProgressedEnough || reducerStalled)) {
           LOG.fatal("Shuffle failed with too many fetch failures " +
@@ -2249,8 +2257,8 @@
           if (duration > maxMapRuntime) {
             maxMapRuntime = duration;
             // adjust max-fetch-retries based on max-map-run-time
-            maxFetchRetriesPerMap =
-              getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1);
+            maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP,
+                getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1));
           }
           URL mapOutputLocation = new URL(event.getTaskTrackerHttp() +
                                "/mapOutput?job=" + taskId.getJobID() +
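To make the retry arithmetic above concrete, here is a hedged, standalone
sketch of how the new lower bound interacts with the exponential back-off. It
is not the Hadoop code: BACKOFF_INIT is assumed to be 4000 ms (the 4-second
initial back-off implied by the "4,8,16,32,64,128" comment), maxBackoff is
assumed to be a back-off budget in seconds, and getClosestPowerOf2 is
reimplemented here as "exponent of the nearest power of two".

    /** Hypothetical sketch of the fetch-retry computation after HADOOP-4246. */
    public class FetchRetrySketch {

      /** Assumed initial fetch back-off, in milliseconds. */
      private static final int BACKOFF_INIT = 4000;

      /** Lower bound on retries introduced by the patch. */
      private static final int MIN_FETCH_RETRIES_PER_MAP = 2;

      /** Exponent of the power of two closest to value (assumed behavior). */
      static int getClosestPowerOf2(int value) {
        if (value <= 0) {
          throw new IllegalArgumentException("Undefined for " + value);
        }
        int hob = Integer.highestOneBit(value);
        return Integer.numberOfTrailingZeros(hob)
            + (((hob >>> 1) & value) == 0 ? 0 : 1);
      }

      /**
       * Back-off doubles on every retry (4s, 8s, 16s, ...), so the number of
       * retries that fits in a budget of maxBackoff seconds is roughly
       * log2(maxBackoff * 1000 / BACKOFF_INIT); the patch floors it at 2 so a
       * tiny configured budget can no longer reduce it to zero or one.
       */
      static int maxFetchRetriesPerMap(int maxBackoffSeconds) {
        return Math.max(MIN_FETCH_RETRIES_PER_MAP,
            getClosestPowerOf2((maxBackoffSeconds * 1000 / BACKOFF_INIT) + 1));
      }

      public static void main(String[] args) {
        // With a 300-second budget, 6 retries fit: 4+8+16+32+64+128 = 252 sec
        // = 4.2 min, matching the comment in the patch. Prints 6.
        System.out.println(maxFetchRetriesPerMap(300));
      }
    }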