hadoop-hdfs-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject svn commit: r1126282 - in /hadoop/hdfs/branches/branch-0.22: CHANGES.txt src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
Date Mon, 23 May 2011 00:24:44 GMT
Author: mattf
Date: Mon May 23 00:24:44 2011
New Revision: 1126282

URL: http://svn.apache.org/viewvc?rev=1126282&view=rev
Log:
HDFS-1921. saveNamespace can cause NN to be unable to come up on restart.

Modified:
    hadoop/hdfs/branches/branch-0.22/CHANGES.txt
    hadoop/hdfs/branches/branch-0.22/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
    hadoop/hdfs/branches/branch-0.22/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java

Modified: hadoop/hdfs/branches/branch-0.22/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/branch-0.22/CHANGES.txt?rev=1126282&r1=1126281&r2=1126282&view=diff
==============================================================================
--- hadoop/hdfs/branches/branch-0.22/CHANGES.txt (original)
+++ hadoop/hdfs/branches/branch-0.22/CHANGES.txt Mon May 23 00:24:44 2011
@@ -8,6 +8,9 @@ Release 0.22.0 - Unreleased
 
   NEW FEATURES
 
+    HDFS-1921. saveNamespace can cause NN to be unable to come up on restart.
+    (Matt Foley)
+
     HDFS-992. Re-factor block access token implementation to conform to the 
     generic Token interface in Common (Kan Zhang and Jitendra Pandey via 
     jghoman)

Modified: hadoop/hdfs/branches/branch-0.22/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/branch-0.22/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1126282&r1=1126281&r2=1126282&view=diff
==============================================================================
--- hadoop/hdfs/branches/branch-0.22/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/hdfs/branches/branch-0.22/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java Mon May 23 00:24:44 2011
@@ -1201,10 +1201,19 @@ public class FSImage extends Storage {
     for (Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.IMAGE);
                                                               it.hasNext();) {
       StorageDirectory sd = it.next();
-      FSImageSaver saver = new FSImageSaver(sd, errorSDs);
-      Thread saveThread = new Thread(saver, saver.toString());
-      saveThreads.add(saveThread);
-      saveThread.start();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
+      try {
+        FSImageSaver saver = new FSImageSaver(sd, errorSDs);
+        Thread saveThread = new Thread(saver, saver.toString());
+        saveThreads.add(saveThread);
+        saveThread.start();
+      } catch (Exception e) {
+        LOG.error("Failed save to image directory " + sd.getRoot(), e);
+        errorSDs.add(sd);
+        continue;
+      }
     }
     waitForThreads(saveThreads);
     saveThreads.clear();
@@ -1222,21 +1231,34 @@ public class FSImage extends Storage {
     for (Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.EDITS);
                                                               it.hasNext();) {
       final StorageDirectory sd = it.next();
-      FSImageSaver saver = new FSImageSaver(sd, errorSDs);
-      Thread saveThread = new Thread(saver, saver.toString());
-      saveThreads.add(saveThread);
-      saveThread.start();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
+      try {
+        FSImageSaver saver = new FSImageSaver(sd, errorSDs);
+        Thread saveThread = new Thread(saver, saver.toString());
+        saveThreads.add(saveThread);
+        saveThread.start();
+      } catch (Exception e) {
+        LOG.error("Failed save to edits directory " + sd.getRoot(), e);
+        errorSDs.add(sd);
+        continue;
+      }
     }
     waitForThreads(saveThreads);
 
     // mv lastcheckpoint.tmp -> previous.checkpoint
     for (Iterator<StorageDirectory> it = dirIterator(); it.hasNext();) {
       StorageDirectory sd = it.next();
+      if (errorSDs.contains(sd)) {
+        continue;
+      }
       try {
         moveLastCheckpoint(sd);
       } catch(IOException ie) {
         LOG.error("Unable to move last checkpoint for " + sd.getRoot(), ie);
         errorSDs.add(sd);
+        continue;
       }
     }
     

Modified: hadoop/hdfs/branches/branch-0.22/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/branches/branch-0.22/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java?rev=1126282&r1=1126281&r2=1126282&view=diff
==============================================================================
--- hadoop/hdfs/branches/branch-0.22/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java (original)
+++ hadoop/hdfs/branches/branch-0.22/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java Mon May 23 00:24:44 2011
@@ -158,8 +158,35 @@ public class TestSaveNamespace {
     saveNamespaceWithInjectedFault(Fault.MOVE_LAST_CHECKPOINT);
   }
 
+  /**
+   * Test case where savenamespace fails in all directories
+   * and then the NN shuts down. Here we should recover from the
+   * failed checkpoint by moving the directories back on next
+   * NN start. This is a regression test for HDFS-1921.
+   */
   @Test
   public void testFailedSaveNamespace() throws Exception {
+    doTestFailedSaveNamespace(false);
+  }
+
+  /**
+   * Test case where saveNamespace fails in all directories, but then
+   * the operator restores the directories and calls it again.
+   * This should leave the NN in a clean state for next start.
+   */
+  @Test
+  public void testFailedSaveNamespaceWithRecovery() throws Exception {
+    doTestFailedSaveNamespace(true);
+  }
+
+  /**
+   * Injects a failure on all storage directories while saving namespace.
+   *
+   * @param restoreStorageAfterFailure if true, will try to save again after
+   *   clearing the failure injection
+   */
+  public void doTestFailedSaveNamespace(boolean restoreStorageAfterFailure)
+  throws Exception {
     Configuration conf = getConf();
     NameNode.initMetrics(conf, NamenodeRole.ACTIVE);
     NameNode.format(conf);
@@ -190,11 +217,13 @@ public class TestSaveNamespace {
       }
 
       // Ensure that, if storage dirs come back online, things work again.
-      Mockito.reset(spyImage);
-      spyImage.setRestoreFailedStorage(true);
-      spyImage.attemptRestoreRemovedStorage();
-      fsn.saveNamespace();
-      checkEditExists(fsn, 1);
+      if (restoreStorageAfterFailure) {
+        Mockito.reset(spyImage);
+        spyImage.setRestoreFailedStorage(true);
+        spyImage.attemptRestoreRemovedStorage();
+        fsn.saveNamespace();
+        checkEditExists(fsn, 1);
+      }
 
       // Now shut down and restart the NN
       originalImage.close();



Mime
View raw message