hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From szets...@apache.org
Subject svn commit: r1440259 - in /hadoop/common/branches/branch-1: CHANGES.txt src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java
Date Wed, 30 Jan 2013 02:50:13 GMT
Author: szetszwo
Date: Wed Jan 30 02:50:13 2013
New Revision: 1440259

URL: http://svn.apache.org/viewvc?rev=1440259&view=rev
Log:
HDFS-4423. Checkpoint exception may cause fatal damage to fsimage.  Contributed by Chris Nauroth

Modified:
    hadoop/common/branches/branch-1/CHANGES.txt
    hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
    hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java

Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1440259&r1=1440258&r2=1440259&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Wed Jan 30 02:50:13 2013
@@ -516,6 +516,9 @@ Release 1.1.2 - Unreleased
     MAPREDUCE-4888. Fixed NLineInputFormat one-off error which dropped data.
     (vinodkv via acmurthy) 
 
+    HDFS-4423. Checkpoint exception may cause fatal damage to fsimage.
+    (Chris Nauroth via szetszwo)
+
 Release 1.1.1 - 2012.11.18
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1440259&r1=1440258&r2=1440259&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java Wed Jan 30 02:50:13 2013
@@ -834,11 +834,13 @@ public class FSImage extends Storage {
         + (FSNamesystem.now() - startTime)/1000 + " seconds.");
     
     // Load latest edits
-    if (latestNameCheckpointTime > latestEditsCheckpointTime)
+    if (latestNameCheckpointTime > latestEditsCheckpointTime) {
       // the image is already current, discard edits
       needToSave |= true;
-    else // latestNameCheckpointTime == latestEditsCheckpointTime
+      FSNamesystem.getFSNamesystem().dir.updateCountForINodeWithQuota();
+    } else { // latestNameCheckpointTime == latestEditsCheckpointTime
       needToSave |= (loadFSEdits(latestEditsSD, recovery) > 0);
+    }
     
     return needToSave;
   }

Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java?rev=1440259&r1=1440258&r2=1440259&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestNameNodeCorruptionRecovery.java Wed Jan 30 02:50:13 2013
@@ -19,14 +19,20 @@ package org.apache.hadoop.hdfs.server.na
 
 import static org.junit.Assert.*;
 
+import java.io.DataOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.FSImage.NameNodeDirType;
 import org.apache.hadoop.hdfs.server.namenode.FSImage.NameNodeFile;
+import org.apache.hadoop.io.IOUtils;
 import org.junit.After;
-import org.junit.Before;
 import org.junit.Test;
 
 /**
@@ -34,18 +40,17 @@ import org.junit.Test;
  * directories.
  */
 public class TestNameNodeCorruptionRecovery {
+
+  private static final Log LOG = LogFactory.getLog(
+    TestNameNodeCorruptionRecovery.class);
   
   private MiniDFSCluster cluster;
   
-  @Before
-  public void setUpCluster() throws IOException {
-    cluster = new MiniDFSCluster(new Configuration(), 0, true, null);
-    cluster.waitActive();
-  }
-  
   @After
   public void tearDownCluster() {
-    cluster.shutdown();
+    if (cluster != null) {
+      cluster.shutdown();
+    }
   }
 
   /**
@@ -54,13 +59,99 @@ public class TestNameNodeCorruptionRecov
    */
   @Test
   public void testFsTimeFileCorrupt() throws IOException, InterruptedException {
+    cluster = new MiniDFSCluster(new Configuration(), 0, true, null);
+    cluster.waitActive();
     assertEquals(cluster.getNameDirs().size(), 2);
     // Get the first fstime file and truncate it.
     truncateStorageDirFile(cluster, NameNodeFile.TIME, 0);
     // Make sure we can start up despite the fact the fstime file is corrupted.
     cluster.restartNameNode();
   }
-  
+
+  /**
+   * Tests that a cluster's image is not damaged if checkpoint fails after
+   * writing checkpoint time to the image directory but before writing checkpoint
+   * time to the edits directory.  This is a very rare failure scenario that can
+   * only occur if the namenode is configured with separate directories for image
+   * and edits.  This test simulates the failure by forcing the fstime file for
+   * edits to contain 0, so that it appears the checkpoint time for edits is less
+   * than the checkpoint time for image.
+   */
+  @Test
+  public void testEditsFsTimeLessThanImageFsTime() throws Exception {
+    // Create a cluster with separate directories for image and edits.
+    Configuration conf = new Configuration();
+    File testDir = new File(System.getProperty("test.build.data",
+      "build/test/data"), "dfs/");
+    conf.set("dfs.name.dir", new File(testDir, "name").getPath());
+    conf.set("dfs.name.edits.dir", new File(testDir, "edits").getPath());
+    cluster = new MiniDFSCluster(0, conf, 1, true, false, true, null, null, null,
+      null);
+    cluster.waitActive();
+
+    // Create several files to generate some edits.
+    createFile("one");
+    createFile("two");
+    createFile("three");
+    assertTrue(checkFileExists("one"));
+    assertTrue(checkFileExists("two"));
+    assertTrue(checkFileExists("three"));
+
+    // Restart to force a checkpoint.
+    cluster.restartNameNode();
+
+    // Shutdown so that we can safely modify the fstime file.
+    File[] editsFsTime = cluster.getNameNode().getFSImage().getFileNames(
+      NameNodeFile.TIME, NameNodeDirType.EDITS);
+    assertTrue("expected exactly one edits directory containing fstime file",
+      editsFsTime.length == 1);
+    cluster.shutdown();
+
+    // Write 0 into the fstime file for the edits directory.
+    FileOutputStream fos = null;
+    DataOutputStream dos = null;
+    try {
+      fos = new FileOutputStream(editsFsTime[0]);
+      dos = new DataOutputStream(fos);
+      dos.writeLong(0);
+    } finally {
+      IOUtils.cleanup(LOG, dos, fos);
+    }
+
+    // Restart to force another checkpoint, which should discard the old edits.
+    cluster = new MiniDFSCluster(0, conf, 1, false, false, true, null, null,
+      null, null);
+    cluster.waitActive();
+
+    // Restart one more time.  If all of the prior checkpoints worked correctly,
+    // then we expect to load the image successfully and find the files.
+    cluster.restartNameNode();
+    assertTrue(checkFileExists("one"));
+    assertTrue(checkFileExists("two"));
+    assertTrue(checkFileExists("three"));
+  }
+
+  /**
+   * Checks that a file exists in the cluster.
+   * 
+   * @param file String name of file to check
+   * @return boolean true if file exists
+   * @throws IOException thrown if there is an I/O error
+   */
+  private boolean checkFileExists(String file) throws IOException {
+    return cluster.getFileSystem().exists(new Path(file));
+  }
+
+  /**
+   * Creates a new, empty file in the cluster.
+   * 
+   * @param file String name of file to create
+   * @throws IOException thrown if there is an I/O error
+   */
+  private void createFile(String file) throws IOException {
+    cluster.getFileSystem().create(new Path(file)).close();
+  }
+
   private static void truncateStorageDirFile(MiniDFSCluster cluster,
       NameNodeFile f, int storageDirIndex) throws IOException {
     File currentDir = cluster.getNameNode().getFSImage()
@@ -70,4 +161,4 @@ public class TestNameNodeCorruptionRecov
     assertTrue(nameNodeFile.delete());
     assertTrue(nameNodeFile.createNewFile());
   }
-}
\ No newline at end of file
+}



Mime
View raw message