hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kih...@apache.org
Subject hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.
Date Mon, 26 Feb 2018 17:14:36 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.9 627a32375 -> a6343ff80


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss.
Contributed by Kihwal Lee.

(cherry picked from commit 4b43f2aa566322317a7f3163027bf5fd0a247207)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a6343ff8
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a6343ff8
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a6343ff8

Branch: refs/heads/branch-2.9
Commit: a6343ff808dcdabfa11b0f713a445cdb30474fa7
Parents: 627a323
Author: Kihwal Lee <kihwal@apache.org>
Authored: Mon Feb 26 10:59:09 2018 -0600
Committer: Kihwal Lee <kihwal@apache.org>
Committed: Mon Feb 26 10:59:47 2018 -0600

----------------------------------------------------------------------
 .../server/datanode/BlockRecoveryWorker.java    |  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 ++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6343ff8/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index aa36247..8d218ae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -293,10 +293,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         StringBuilder b = new StringBuilder();
         for(DatanodeID id : failedList) {
           b.append("\n  " + id);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6343ff8/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message