hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)
Date Wed, 30 Aug 2017 23:16:41 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 70df729a1 -> c2350ec42


HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P.
McCabe)

(cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c2350ec4
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c2350ec4
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c2350ec4

Branch: refs/heads/branch-2.7
Commit: c2350ec42d9389fc6374a395574f4bdd646209c0
Parents: 70df729
Author: Colin Patrick Mccabe <cmccabe@cloudera.com>
Authored: Fri Sep 25 15:25:42 2015 -0700
Committer: Konstantin V Shvachko <shv@apache.org>
Committed: Wed Aug 30 16:15:03 2017 -0700

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  3 ++
 .../blockmanagement/HeartbeatManager.java       | 30 ++++++++++++++++++--
 .../blockmanagement/TestHeartbeatHandling.java  | 27 ++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 845a964..d1ebdcd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -19,6 +19,9 @@ Release 2.7.5 - UNRELEASED
     HDFS-12157. Do fsyncDirectory(..) outside of FSDataset lock.
     (Vinayakumar B. via kihwal)
 
+    HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn
+    Sharp via Colin P. McCabe)
+
 Release 2.7.4 - 2017-08-04
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index dfc3c60..93e9b32 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -28,10 +29,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.StopWatch;
 import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Manage the heartbeats received from datanodes.
  * The datanode list and statistics are synchronized
@@ -56,8 +60,8 @@ class HeartbeatManager implements DatanodeStatistics {
   private final long heartbeatRecheckInterval;
   /** Heartbeat monitor thread */
   private final Daemon heartbeatThread = new Daemon(new Monitor());
+  private final StopWatch heartbeatStopWatch = new StopWatch();
 
-    
   final Namesystem namesystem;
   final BlockManager blockManager;
 
@@ -245,7 +249,18 @@ class HeartbeatManager implements DatanodeStatistics {
       stats.add(node);
     }
   }
-  
+
+  @VisibleForTesting
+  void restartHeartbeatStopWatch() {
+    heartbeatStopWatch.reset().start();
+  }
+
+  @VisibleForTesting
+  boolean shouldAbortHeartbeatCheck(long offset) {
+    long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
+    return elapsed + offset > heartbeatRecheckInterval;
+  }
+
   /**
    * Check if there are any expired heartbeats, and if so,
    * whether any blocks have to be re-replicated.
@@ -292,6 +307,10 @@ class HeartbeatManager implements DatanodeStatistics {
       int numOfStaleStorages = 0;
       synchronized(this) {
         for (DatanodeDescriptor d : datanodes) {
+          // check if an excessive GC pause has occurred
+          if (shouldAbortHeartbeatCheck(0)) {
+            return;
+          }
           if (dead == null && dm.isDatanodeDead(d)) {
             stats.incrExpiredHeartbeats();
             dead = d;
@@ -360,6 +379,7 @@ class HeartbeatManager implements DatanodeStatistics {
     @Override
     public void run() {
       while(namesystem.isRunning()) {
+        restartHeartbeatStopWatch();
         try {
           final long now = Time.monotonicNow();
           if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
@@ -381,6 +401,12 @@ class HeartbeatManager implements DatanodeStatistics {
           Thread.sleep(5000);  // 5 seconds
         } catch (InterruptedException ie) {
         }
+        // avoid declaring nodes dead for another cycle if a GC pause lasts
+        // longer than the node recheck interval
+        if (shouldAbortHeartbeatCheck(-5000)) {
+          LOG.warn("Skipping next heartbeat scan due to excessive pause");
+          lastHeartbeatCheck = Time.monotonicNow();
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
index 6fc30ba..c68ff37 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
@@ -18,6 +18,8 @@
 package org.apache.hadoop.hdfs.server.blockmanagement;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 import java.util.ArrayList;
 
@@ -33,6 +35,7 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
 import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
@@ -41,6 +44,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
 import org.apache.hadoop.util.Time;
 import org.junit.Test;
+import org.mockito.Mockito;
 
 /**
  * Test if FSNamesystem handles heartbeat right
@@ -241,4 +245,27 @@ public class TestHeartbeatHandling {
       cluster.shutdown();
     }
   }
+
+  @Test
+  public void testHeartbeatStopWatch() throws Exception {
+   Namesystem ns = Mockito.mock(Namesystem.class);
+   BlockManager bm = Mockito.mock(BlockManager.class);
+   Configuration conf = new Configuration();
+   long recheck = 2000;
+   conf.setLong(
+       DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, recheck);
+   HeartbeatManager monitor = new HeartbeatManager(ns, bm, conf);
+   monitor.restartHeartbeatStopWatch();
+   assertFalse(monitor.shouldAbortHeartbeatCheck(0));
+   // sleep shorter than recheck and verify shouldn't abort
+   Thread.sleep(100);
+   assertFalse(monitor.shouldAbortHeartbeatCheck(0));
+   // sleep longer than recheck and verify should abort unless ignore delay
+   Thread.sleep(recheck);
+   assertTrue(monitor.shouldAbortHeartbeatCheck(0));
+   assertFalse(monitor.shouldAbortHeartbeatCheck(-recheck*3));
+   // ensure it resets properly
+   monitor.restartHeartbeatStopWatch();
+   assertFalse(monitor.shouldAbortHeartbeatCheck(0));
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message