hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From weic...@apache.org
Subject [hadoop] branch branch-2.8 updated: HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over to standby. Contributed by yunjiong zhao, Wei-Chiu Chuang and star.
Date Tue, 16 Apr 2019 18:10:24 GMT
This is an automated email from the ASF dual-hosted git repository.

weichiu pushed a commit to branch branch-2.8
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-2.8 by this push:
     new 93394c8  HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over
to standby. Contributed by yunjiong zhao, Wei-Chiu Chuang and star.
93394c8 is described below

commit 93394c8f5541c49aa54ee26d45f7dc77f86ec02c
Author: Wei-Chiu Chuang <weichiu@apache.org>
AuthorDate: Tue Apr 16 11:09:16 2019 -0700

    HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over to standby.
Contributed by yunjiong zhao, Wei-Chiu Chuang and star.
    
    Signed-off-by: Wei-Chiu Chuang <weichiu@apache.org>
---
 .../hdfs/server/blockmanagement/BlockManager.java  | 53 ++++++++++++++++------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index e2bdfcb..4012f57 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -3430,21 +3430,46 @@ public class BlockManager implements BlockStatsMXBean {
     if (!isPopulatingReplQueues()) {
       return;
     }
-    final Iterator<BlockInfo> it = srcNode.getBlockIterator();
+
     int numOverReplicated = 0;
-    while(it.hasNext()) {
-      final BlockInfo block = it.next();
-      short expectedReplication = block.getReplication();
-      NumberReplicas num = countNodes(block);
-      int numCurrentReplica = num.liveReplicas();
-      if (numCurrentReplica > expectedReplication) {
-        // over-replicated block 
-        processOverReplicatedBlock(block, expectedReplication, null, null);
-        numOverReplicated++;
-      }
-    }
-    LOG.info("Invalidated " + numOverReplicated + " over-replicated blocks on " +
-        srcNode + " during recommissioning");
+    for (DatanodeStorageInfo datanodeStorageInfo : srcNode.getStorageInfos()) {
+      // the namesystem lock is released between iterations. Make sure the
+      // storage is not removed before continuing.
+      if (srcNode.getStorageInfo(datanodeStorageInfo.getStorageID()) == null) {
+        continue;
+      }
+      final Iterator<BlockInfo> it = datanodeStorageInfo.getBlockIterator();
+      while (it.hasNext()) {
+        final BlockInfo block = it.next();
+        if (block.isDeleted()) {
+          //Orphan block, will be handled eventually, skip
+          continue;
+        }
+        short expectedReplication = this.getExpectedReplicaNum(block);
+        NumberReplicas num = countNodes(block);
+        int numCurrentReplica = num.liveReplicas();
+        if (numCurrentReplica > expectedReplication) {
+          // over-replicated block
+          processOverReplicatedBlock(block, expectedReplication, null,
+              null);
+          numOverReplicated++;
+        }
+      }
+      // When called by tests like TestDefaultBlockPlacementPolicy.
+      // testPlacementWithLocalRackNodesDecommissioned, it is not protected by
+      // lock, only when called by DatanodeManager.refreshNodes have writeLock
+      if (namesystem.hasWriteLock()) {
+        namesystem.writeUnlock();
+        try {
+          Thread.sleep(1);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+        }
+        namesystem.writeLock();
+      }
+    }
+    LOG.info("Invalidated " + numOverReplicated +
+        " over-replicated blocks on " + srcNode + " during recommissioning");
   }
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message