hadoop-hdfs-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From szets...@apache.org
Subject svn commit: r1583020 - in /hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs: ./ CHANGES.txt src/main/java/ src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
Date Sat, 29 Mar 2014 16:24:56 GMT
Author: szetszwo
Date: Sat Mar 29 16:24:55 2014
New Revision: 1583020

URL: http://svn.apache.org/r1583020
Log:
svn merge -c 1583018 from trunk for HDFS-6166. Change Balancer socket read timeout to 20 minutes
and add 10 seconds delay after error.

Modified:
    hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/   (props changed)
    hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/   (props
changed)
    hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java

Propchange: hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs:r1583018

Modified: hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1583020&r1=1583019&r2=1583020&view=diff
==============================================================================
--- hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Sat Mar
29 16:24:55 2014
@@ -444,6 +444,9 @@ Release 2.4.0 - UNRELEASED
     HDFS-6163. Fix a minor bug in the HA upgrade document. (Fengdong Yu via
     jing9)
 
+    HDFS-6166. Change Balancer socket read timeout to 20 minutes and add
+    10 seconds delay after error.  (Nathan Roberts via szetszwo)
+
   BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
 
     HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)

Propchange: hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java:r1583018

Modified: hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java?rev=1583020&r1=1583019&r2=1583020&view=diff
==============================================================================
--- hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
(original)
+++ hadoop/common/branches/branch-2.4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
Sat Mar 29 16:24:55 2014
@@ -190,6 +190,8 @@ public class Balancer {
    */
   public static final int MAX_NUM_CONCURRENT_MOVES = 5;
   private static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
+  public static final long DELAY_AFTER_ERROR = 10 * 1000L; //10 seconds
+  public static final int BLOCK_MOVE_READ_TIMEOUT=20*60*1000; // 20 minutes
   
   private static final String USAGE = "Usage: java "
       + Balancer.class.getSimpleName()
@@ -337,7 +339,14 @@ public class Balancer {
         sock.connect(
             NetUtils.createSocketAddr(target.datanode.getXferAddr()),
             HdfsServerConstants.READ_TIMEOUT);
-        sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+        /* Unfortunately we don't have a good way to know if the Datanode is
+         * taking a really long time to move a block, OR something has
+         * gone wrong and it's never going to finish. To deal with this 
+         * scenario, we set a long timeout (20 minutes) to avoid hanging
+         * the balancer indefinitely.
+         */
+        sock.setSoTimeout(BLOCK_MOVE_READ_TIMEOUT);
+
         sock.setKeepAlive(true);
         
         OutputStream unbufOut = sock.getOutputStream();
@@ -360,6 +369,13 @@ public class Balancer {
         LOG.info("Successfully moved " + this);
       } catch (IOException e) {
         LOG.warn("Failed to move " + this + ": " + e.getMessage());
+        /* proxy or target may have an issue, insert a small delay
+         * before using these nodes further. This avoids a potential storm
+         * of "threads quota exceeded" Warnings when the balancer
+         * gets out of sync with work going on in datanode.
+         */
+        proxySource.activateDelay(DELAY_AFTER_ERROR);
+        target.activateDelay(DELAY_AFTER_ERROR);
       } finally {
         IOUtils.closeStream(out);
         IOUtils.closeStream(in);
@@ -497,6 +513,7 @@ public class Balancer {
     final double utilization;
     final long maxSize2Move;
     private long scheduledSize = 0L;
+    protected long delayUntil = 0L;
     //  blocks being moved but not confirmed yet
     private final List<PendingBlockMove> pendingBlocks =
       new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES); 
@@ -573,6 +590,18 @@ public class Balancer {
     protected synchronized void setScheduledSize(long size){
       scheduledSize = size;
     }
+
+    synchronized private void activateDelay(long delta) {
+      delayUntil = Time.now() + delta;
+    }
+
+    synchronized private boolean isDelayActive() {
+      if (delayUntil == 0 || Time.now() > delayUntil){
+        delayUntil = 0;
+        return false;
+      }
+        return true;
+    }
     
     /* Check if the node can schedule more blocks to move */
     synchronized private boolean isPendingQNotFull() {
@@ -590,7 +619,7 @@ public class Balancer {
     /* Add a scheduled block move to the node */
     private synchronized boolean addPendingBlock(
         PendingBlockMove pendingBlock) {
-      if (isPendingQNotFull()) {
+      if (!isDelayActive() && isPendingQNotFull()) {
         return pendingBlocks.add(pendingBlock);
       }
       return false;



Mime
View raw message