hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hair...@apache.org
Subject svn commit: r732777 - in /hadoop/core/trunk: ./ src/hdfs/org/apache/hadoop/hdfs/server/namenode/ src/test/org/apache/hadoop/hdfs/ src/test/org/apache/hadoop/hdfs/server/namenode/
Date Thu, 08 Jan 2009 18:27:00 GMT
Author: hairong
Date: Thu Jan  8 10:26:59 2009
New Revision: 732777

URL: http://svn.apache.org/viewvc?rev=732777&view=rev
Log:
HADOOP-4910. NameNode should exclude corrupt replicas when choosing excessive replicas to
delete. Contributed by Hairong Kuang.

Added:
    hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/server/namenode/TestOverReplicatedBlocks.java
Modified:
    hadoop/core/trunk/CHANGES.txt
    hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
    hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
    hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/TestDatanodeBlockScanner.java

Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=732777&r1=732776&r2=732777&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Thu Jan  8 10:26:59 2009
@@ -1693,6 +1693,8 @@
     HADOOP-4971. A long (unexpected) delay at datanodes could make subsequent
     block reports from many datanode at the same time. (Raghu Angadi)
     
+    HADOOP-4910. NameNode should exclude replicas when choosing excessive
+    replicas to delete to avoid data lose. (hairong)
 
 Release 0.18.2 - 2008-11-03
 

Modified: hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=732777&r1=732776&r2=732777&view=diff
==============================================================================
--- hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original)
+++ hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Thu
Jan  8 10:26:59 2009
@@ -2997,13 +2997,17 @@
       delNodeHint = null;
     }
     Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
+    Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block);
     for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); 
          it.hasNext();) {
       DatanodeDescriptor cur = it.next();
       Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());
       if (excessBlocks == null || !excessBlocks.contains(block)) {
         if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
-          nonExcess.add(cur);
+          // exclude corrupt replicas
+          if (corruptNodes == null || !corruptNodes.contains(cur)) {
+            nonExcess.add(cur);
+          }
         }
       }
     }

Modified: hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java?rev=732777&r1=732776&r2=732777&view=diff
==============================================================================
--- hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java (original)
+++ hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Thu Jan  8 10:26:59
2009
@@ -638,7 +638,7 @@
   /*
    * Restart a particular datanode
    */
-  synchronized boolean restartDataNode(int i) throws IOException {
+  public synchronized boolean restartDataNode(int i) throws IOException {
     DataNodeProperties dnprop = stopDataNode(i);
     if (dnprop == null) {
       return false;

Modified: hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/TestDatanodeBlockScanner.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/TestDatanodeBlockScanner.java?rev=732777&r1=732776&r2=732777&view=diff
==============================================================================
--- hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/TestDatanodeBlockScanner.java (original)
+++ hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/TestDatanodeBlockScanner.java Thu Jan
 8 10:26:59 2009
@@ -144,7 +144,7 @@
     cluster.shutdown();
   }
 
-  boolean corruptReplica(String blockName, int replica) throws IOException {
+  public static boolean corruptReplica(String blockName, int replica) throws IOException
{
     Random random = new Random();
     File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
     boolean corrupted = false;
@@ -423,7 +423,7 @@
     }
   }
   
-  private void truncateReplica(String blockName, int dnIndex) throws IOException {
+  private static void truncateReplica(String blockName, int dnIndex) throws IOException {
     File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
     for (int i=dnIndex*2; i<dnIndex*2+2; i++) {
       File blockFile = new File(baseDir, "data" + (i+1)+ "/current/" + 
@@ -437,7 +437,7 @@
     }
   }
   
-  private void waitForBlockDeleted(String blockName, int dnIndex) 
+  private static void waitForBlockDeleted(String blockName, int dnIndex) 
   throws IOException, InterruptedException {
     File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
     File blockFile1 = new File(baseDir, "data" + (2*dnIndex+1)+ "/current/" + 

Added: hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/server/namenode/TestOverReplicatedBlocks.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/server/namenode/TestOverReplicatedBlocks.java?rev=732777&view=auto
==============================================================================
--- hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/server/namenode/TestOverReplicatedBlocks.java
(added)
+++ hadoop/core/trunk/src/test/org/apache/hadoop/hdfs/server/namenode/TestOverReplicatedBlocks.java
Thu Jan  8 10:26:59 2009
@@ -0,0 +1,69 @@
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.TestDatanodeBlockScanner;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
+
+import junit.framework.TestCase;
+
+public class TestOverReplicatedBlocks extends TestCase {
+  /** Test processOverReplicatedBlock can handle corrupt replicas fine.
+   * It make sure that it won't treat corrupt replicas as valid ones 
+   * thus prevents NN deleting valid replicas but keeping
+   * corrupt ones.
+   */
+  public void testProcesOverReplicateBlock() throws IOException {
+    Configuration conf = new Configuration();
+    conf.setLong("dfs.blockreport.intervalMsec", 1000L);
+    conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
+    MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
+    FileSystem fs = cluster.getFileSystem();
+
+    try {
+      final Path fileName = new Path("/foo1");
+      DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
+      DFSTestUtil.waitReplication(fs, fileName, (short)3);
+      
+      // corrupt the block on datanode 0
+      Block block = DFSTestUtil.getFirstBlock(fs, fileName);
+      TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0);
+      // remove block scanner log to trigger block scanning
+      File scanLog = new File(System.getProperty("test.build.data"),
+          "dfs/data/data1/current/dncp_block_verification.log.curr");
+      assertTrue(scanLog.delete()); 
+      // restart the datanode so the corrupt replica will be detected
+      cluster.restartDataNode(0);
+      DFSTestUtil.waitReplication(fs, fileName, (short)2);
+      
+      final DatanodeID corruptDataNode = 
+        cluster.getDataNodes().get(2).dnRegistration;
+      final FSNamesystem namesystem = FSNamesystem.getFSNamesystem();
+      synchronized (namesystem.heartbeats) {
+        // set live datanode's remaining space to be 0 
+        // so they will be chosen to be deleted when over-replication occurs
+        for (DatanodeDescriptor datanode : namesystem.heartbeats) {
+          if (!corruptDataNode.equals(datanode)) {
+            datanode.updateHeartbeat(100L, 100L, 0L, 0);
+          }
+        }
+        
+        // decrease the replication factor to 1; 
+        namesystem.setReplication(fileName.toString(), (short)1);
+
+        // corrupt one won't be chosen to be excess one
+        // without 4910 the number of live replicas would be 0: block gets lost
+        assertEquals(1, namesystem.countNodes(block).liveReplicas());
+      }
+    } finally {
+      cluster.shutdown();
+    }
+  }
+}



Mime
View raw message