hadoop-hdfs-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject svn commit: r1124364 - in /hadoop/hdfs/trunk: CHANGES.txt src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
Date Wed, 18 May 2011 18:19:52 GMT
Author: todd
Date: Wed May 18 18:19:52 2011
New Revision: 1124364

URL: http://svn.apache.org/viewvc?rev=1124364&view=rev
Log:
HDFS-1921. saveNamespace can cause NN to be unable to come up on restart. Contributed by Matt Foley.

Modified:
    hadoop/hdfs/trunk/CHANGES.txt
    hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
    hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java

Modified: hadoop/hdfs/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/CHANGES.txt?rev=1124364&r1=1124363&r2=1124364&view=diff
==============================================================================
--- hadoop/hdfs/trunk/CHANGES.txt (original)
+++ hadoop/hdfs/trunk/CHANGES.txt Wed May 18 18:19:52 2011
@@ -1012,6 +1012,9 @@ Release 0.22.0 - Unreleased
     HDFS-1505. saveNamespace appears to succeed even if all directories fail
     to save. (Aaron T. Myers via todd)
 
+    HDFS-1921. saveNamespace can cause NN to be unable to come up on restart
+    (Matt Foley via todd)
+
 Release 0.21.1 - Unreleased
     HDFS-1466. TestFcHdfsSymlink relies on /tmp/test not existing. (eli)
 

Modified: hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1124364&r1=1124363&r2=1124364&view=diff
==============================================================================
--- hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java Wed May 18 18:19:52 2011
@@ -869,10 +869,19 @@ public class FSImage implements NNStorag
     for (Iterator<StorageDirectory> it
            = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
       StorageDirectory sd = it.next();
-      FSImageSaver saver = new FSImageSaver(sd, errorSDs);
-      Thread saveThread = new Thread(saver, saver.toString());
-      saveThreads.add(saveThread);
-      saveThread.start();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
+      try {
+        FSImageSaver saver = new FSImageSaver(sd, errorSDs);
+        Thread saveThread = new Thread(saver, saver.toString());
+        saveThreads.add(saveThread);
+        saveThread.start();
+      } catch (Exception e) {
+        LOG.error("Failed save to image directory " + sd.getRoot(), e);
+        errorSDs.add(sd);
+        continue;
+      }
     }
     waitForThreads(saveThreads);
     saveThreads.clear();
@@ -890,27 +899,41 @@ public class FSImage implements NNStorag
     for (Iterator<StorageDirectory> it
            = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
       StorageDirectory sd = it.next();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
+
       // if this directory already stores the image and edits, then it was
       // already processed in the earlier loop.
       if (sd.getStorageDirType() == NameNodeDirType.IMAGE_AND_EDITS) {
         continue;
       }
 
-      FSImageSaver saver = new FSImageSaver(sd, errorSDs);
-      Thread saveThread = new Thread(saver, saver.toString());
-      saveThreads.add(saveThread);
-      saveThread.start();
+      try {
+        FSImageSaver saver = new FSImageSaver(sd, errorSDs);
+        Thread saveThread = new Thread(saver, saver.toString());
+        saveThreads.add(saveThread);
+        saveThread.start();
+      } catch (Exception e) {
+        LOG.error("Failed save to edits directory " + sd.getRoot(), e);
+        errorSDs.add(sd);
+        continue;
+      }
     }
     waitForThreads(saveThreads);
 
     // mv lastcheckpoint.tmp -> previous.checkpoint
     for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
       StorageDirectory sd = it.next();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
       try {
         storage.moveLastCheckpoint(sd);
       } catch(IOException ie) {
         LOG.error("Unable to move last checkpoint for " + sd.getRoot(), ie);
         errorSDs.add(sd);
+        continue;
       }
     }
     

Modified: hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java?rev=1124364&r1=1124363&r2=1124364&view=diff
==============================================================================
--- hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java (original)
+++ hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java Wed May 18 18:19:52 2011
@@ -263,9 +263,37 @@ public class TestSaveNamespace {
   public void testCrashWhileMoveLastCheckpoint() throws Exception {
     saveNamespaceWithInjectedFault(Fault.MOVE_LAST_CHECKPOINT);
   }
-  
+ 
+
+  /**
+   * Test case where savenamespace fails in all directories
+   * and then the NN shuts down. Here we should recover from the
+   * failed checkpoint by moving the directories back on next
+   * NN start. This is a regression test for HDFS-1921.
+   */
   @Test
   public void testFailedSaveNamespace() throws Exception {
+    doTestFailedSaveNamespace(false);
+  }
+
+  /**
+   * Test case where saveNamespace fails in all directories, but then
+   * the operator restores the directories and calls it again.
+   * This should leave the NN in a clean state for next start.
+   */
+  @Test
+  public void testFailedSaveNamespaceWithRecovery() throws Exception {
+    doTestFailedSaveNamespace(true);
+  }
+
+  /**
+   * Injects a failure on all storage directories while saving namespace.
+   *
+   * @param restoreStorageAfterFailure if true, will try to save again after
+   *   clearing the failure injection
+   */
+  public void doTestFailedSaveNamespace(boolean restoreStorageAfterFailure)
+  throws Exception {
     Configuration conf = getConf();
     NameNode.initMetrics(conf, NamenodeRole.ACTIVE);
     DFSTestUtil.formatNameNode(conf);
@@ -300,10 +328,12 @@ public class TestSaveNamespace {
       }
       
       // Ensure that, if storage dirs come back online, things work again.
-      Mockito.reset(spyImage);
-      spyStorage.setRestoreFailedStorage(true);
-      fsn.saveNamespace();
-      checkEditExists(fsn, 1);
+      if (restoreStorageAfterFailure) {
+        Mockito.reset(spyImage);
+        spyStorage.setRestoreFailedStorage(true);
+        fsn.saveNamespace();
+        checkEditExists(fsn, 1);
+      }
 
       // Now shut down and restart the NN
       originalImage.close();



Mime
View raw message