hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From szets...@apache.org
Subject svn commit: r1368353 - in /hadoop/common/branches/branch-1: ./ src/hdfs/ src/hdfs/org/apache/hadoop/hdfs/ src/hdfs/org/apache/hadoop/hdfs/server/namenode/ src/test/org/apache/hadoop/hdfs/
Date Thu, 02 Aug 2012 05:22:43 GMT
Author: szetszwo
Date: Thu Aug  2 05:22:43 2012
New Revision: 1368353

URL: http://svn.apache.org/viewvc?rev=1368353&view=rev
Log:
HDFS-528. Backport: Add ability for safemode to wait for a minimum number of live datanodes.


Modified:
    hadoop/common/branches/branch-1/CHANGES.txt
    hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml
    hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
    hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
    hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java

Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Thu Aug  2 05:22:43 2012
@@ -19,6 +19,9 @@ Release 1.2.0 - unreleased
 
     MAPREDUCE-987. Exposing MiniDFS and MiniMR clusters as a single process command-line (philip and ahmed via tucu)
 
+    HDFS-528. Backport: Add ability for safemode to wait for a minimum number
+    of live datanodes.  (szetszwo)
+
   IMPROVEMENTS
 
     HDFS-3515. Port HDFS-1457 to branch-1. (eli)

Modified: hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml (original)
+++ hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml Thu Aug  2 05:22:43 2012
@@ -304,9 +304,24 @@ creations/deletions), or "all".</descrip
   <description>
     Specifies the percentage of blocks that should satisfy 
     the minimal replication requirement defined by dfs.replication.min.
-    Values less than or equal to 0 mean not to start in safe mode.
+    Values less than or equal to 0 mean not to wait for any particular
+    percentage of blocks before exiting safemode.
     Values greater than 1 will make safe mode permanent.
   </description>
+ </property>
+ 
+<property>
+  <name>dfs.namenode.safemode.min.datanodes</name>
+  <value>0</value>
+  <description>
+    Specifies the number of datanodes that must be considered alive
+    before the name node exits safemode.
+    Values less than or equal to 0 mean not to take the number of live
+    datanodes into account when deciding whether to remain in safe mode
+    during startup.
+    Values greater than the number of datanodes in the cluster
+    will make safe mode permanent.
+  </description>
 </property>
 
 <property>

Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java Thu Aug  2 05:22:43 2012
@@ -72,6 +72,8 @@ public class DFSConfigKeys extends Commo
   public static final int     DFS_NAMENODE_SAFEMODE_EXTENSION_DEFAULT = 30000;
   public static final String  DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY = "dfs.namenode.safemode.threshold-pct";
   public static final float   DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT = 0.999f;
+  public static final String  DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY = "dfs.namenode.safemode.min.datanodes";
+  public static final int     DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT = 0;
   public static final String  DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY = "dfs.namenode.secondary.http-address";
   public static final String  DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_DEFAULT = "0.0.0.0:50090";
   public static final String  DFS_NAMENODE_CHECKPOINT_PERIOD_KEY = "dfs.namenode.checkpoint.period";

Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Thu Aug  2 05:22:43 2012
@@ -2532,6 +2532,10 @@ public class FSNamesystem implements FSC
       // no need to update its timestamp
       // because its is done when the descriptor is created
     }
+
+    if (safeMode != null) {
+      safeMode.checkMode();
+    }
     return;
   }
     
@@ -3321,6 +3325,10 @@ public class FSNamesystem implements FSC
     }
     unprotectedRemoveDatanode(nodeInfo);
     clusterMap.remove(nodeInfo);
+
+    if (safeMode != null) {
+      safeMode.checkMode();
+    }
   }
 
   void unprotectedRemoveDatanode(DatanodeDescriptor nodeDescr) {
@@ -4199,6 +4207,10 @@ public class FSNamesystem implements FSC
     }
   }
 
+  int getNumLiveDataNodes() {
+    return getNumberOfDatanodes(DatanodeReportType.LIVE);
+  }
+
   int getNumberOfDatanodes(DatanodeReportType type) {
     return getDatanodeListForReport(type).size(); 
   }
@@ -4733,6 +4745,8 @@ public class FSNamesystem implements FSC
     // configuration fields
     /** Safe mode threshold condition %.*/
     private double threshold;
+    /** Safe mode minimum number of datanodes alive */
+    private int datanodeThreshold;
     /** Safe mode extension after the threshold. */
     private int extension;
     /** Min replication required by safe mode. */
@@ -4760,6 +4774,9 @@ public class FSNamesystem implements FSC
      */
     SafeModeInfo(Configuration conf) {
       this.threshold = conf.getFloat("dfs.safemode.threshold.pct", 0.95f);
+      this.datanodeThreshold = conf.getInt(
+          DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
+          DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
       this.extension = conf.getInt("dfs.safemode.extension", 0);
       this.safeReplication = conf.getInt("dfs.replication.min", 1);
       this.blockTotal = 0; 
@@ -4776,6 +4793,7 @@ public class FSNamesystem implements FSC
      */
     private SafeModeInfo() {
       this.threshold = 1.5f;  // this threshold can never be reached
+      this.datanodeThreshold = Integer.MAX_VALUE;
       this.extension = Integer.MAX_VALUE;
       this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
       this.blockTotal = -1;
@@ -4874,7 +4892,8 @@ public class FSNamesystem implements FSC
      * if DFS is empty or {@link #threshold} == 0
      */
     boolean needEnter() {
-      return getSafeBlockRatio() < threshold;
+      return getSafeBlockRatio() < threshold ||
+          getNumLiveDataNodes() < datanodeThreshold;
     }
       
     /**
@@ -4971,15 +4990,44 @@ public class FSNamesystem implements FSC
       }
       if(blockTotal < 0)
         return leaveMsg + ".";
-      String safeBlockRatioMsg = 
-        String.format("The ratio of reported blocks %.4f has " +
-          (reached == 0 ? "not " : "") + "reached the threshold %.4f. ",
-          getSafeBlockRatio(), threshold) + leaveMsg;
-      if(reached == 0 || isManual())  // threshold is not reached or manual
-        return safeBlockRatioMsg + ".";
+
+      int numLive = getNumLiveDataNodes();
+      String msg = "";
+      if (reached == 0) {
+        if (getSafeBlockRatio() < threshold) {
+          msg += String.format(
+            "The reported blocks is only %d"
+            + " but the threshold is %.4f and the total blocks %d.",
+            blockSafe, threshold, blockTotal);
+        }
+        if (numLive < datanodeThreshold) {
+          if (!"".equals(msg)) {
+            msg += "\n";
+          }
+          msg += String.format(
+            "The number of live datanodes %d needs an additional %d live "
+            + "datanodes to reach the minimum number %d.",
+            numLive, (datanodeThreshold - numLive), datanodeThreshold);
+        }
+        msg += " " + leaveMsg;
+      } else {
+        msg = String.format("The reported blocks %d has reached the threshold"
+            + " %.4f of total blocks %d.", blockSafe, threshold, 
+            blockTotal);
+
+        if (datanodeThreshold > 0) {
+          msg += String.format(" The number of live datanodes %d has reached "
+                               + "the minimum number %d.",
+                               numLive, datanodeThreshold);
+        }
+        msg += " " + leaveMsg;
+      }
+      if(reached == 0 || isManual()) {  // threshold is not reached or manual       
+        return msg + ".";
+      }
       // extension period is in progress
-      return safeBlockRatioMsg + " in " 
-            + Math.abs(reached + extension - now())/1000 + " seconds.";
+      return msg + " in " + Math.abs(reached + extension - now()) / 1000
+          + " seconds.";
     }
 
     /**
@@ -5157,7 +5205,7 @@ public class FSNamesystem implements FSC
     safeMode.leave(checkForUpgrades);
   }
     
-  String getSafeModeTip() {
+  public String getSafeModeTip() {
     if (!isInSafeMode())
       return "";
     return safeMode.getTurnOffTip();

Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java Thu Aug  2 05:22:43 2012
@@ -99,6 +99,50 @@ public class TestSafeMode {
     }
   }
 
+  /**
+   * Verify that the NameNode stays in safemode when
+   * dfs.namenode.safemode.min.datanodes exceeds the number of live datanodes.
+   */
+  @Test
+  public void testDatanodeThreshold() throws IOException {
+    MiniDFSCluster cluster = null;
+    DistributedFileSystem fs = null;
+    try {
+      Configuration conf = new Configuration();
+      conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
+      conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1);
+
+      // bring up a cluster with no datanodes
+      cluster = new MiniDFSCluster(conf, 0, true, null);
+      cluster.waitActive();
+      fs = (DistributedFileSystem)cluster.getFileSystem();
+
+      assertTrue("No datanode started, but we require one - safemode expected",
+                 fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+
+      String tipMsg = cluster.getNameNode().getNamesystem().getSafeModeTip();
+      assertTrue("Safemode tip message looks right",
+                 tipMsg.contains("The number of live datanodes 0 needs an " +
+                                 "additional 1 live"));
+
+      // Start a datanode
+      cluster.startDataNodes(conf, 1, true, null, null);
+
+      // Wait long enough for safemode check to refire
+      try {
+        Thread.sleep(1000);
+      } catch (InterruptedException ignored) {}
+
+      // We now should be out of safe mode.
+      assertFalse(
+        "Out of safe mode after starting datanode.",
+        fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+    } finally {
+      if (fs != null) fs.close();
+      if (cluster != null) cluster.shutdown();
+    }
+  }
+
   @Test
   public void testSafeModeWhenZeroBlockLocations() throws IOException {
     MiniDFSCluster cluster = null;



Mime
View raw message