hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ramkris...@apache.org
Subject svn commit: r1343326 - in /hbase/branches/0.94/src: main/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/regionserver/
Date Mon, 28 May 2012 17:17:09 GMT
Author: ramkrishna
Date: Mon May 28 17:17:08 2012
New Revision: 1343326

URL: http://svn.apache.org/viewvc?rev=1343326&view=rev
Log:
HBASE-5916 RS restart just before master initialization we make the cluster non operative (RajeshBabu)

Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1343326&r1=1343325&r2=1343326&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon May 28 17:17:08 2012
@@ -333,7 +333,7 @@ public class AssignmentManager extends Z
    * @throws KeeperException
    * @throws InterruptedException
    */
-  void joinCluster(final Set<ServerName> onlineServers) throws IOException,
+  void joinCluster() throws IOException,
       KeeperException, InterruptedException {
     // Concurrency note: In the below the accesses on regionsInTransition are
     // outside of a synchronization block where usually all accesses to RIT are
@@ -345,7 +345,7 @@ public class AssignmentManager extends Z
 
     // Scan META to build list of existing regions, servers, and assignment
     // Returns servers who have not checked in (assumed dead) and their regions
-    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
+    Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
 
     processDeadServersAndRegionsInTransition(deadServers);
 
@@ -356,16 +356,6 @@ public class AssignmentManager extends Z
   }
 
   /**
-   * Only used for tests
-   * @throws IOException
-   * @throws KeeperException
-   * @throws InterruptedException
-   */
-  void joinCluster() throws IOException, KeeperException, InterruptedException {
-    joinCluster(serverManager.getOnlineServers().keySet());
-  }
-
-  /**
    * Process all regions that are in transition up in zookeeper.  Used by
    * master joining an already running cluster.
    * @throws KeeperException
@@ -2456,11 +2446,12 @@ public class AssignmentManager extends Z
    *         in META
    * @throws IOException
    */
-  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
-      final Set<ServerName> onlineServers)
-  throws IOException, KeeperException {
+  Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions() throws IOException,
+      KeeperException {
     // Region assignment from META
     List<Result> results = MetaReader.fullScan(this.catalogTracker);
+    // Get any new but slow to checkin region server that joined the cluster
+    Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();    
     // Map of offline servers and their regions to be returned
     Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
       new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@@ -2658,8 +2649,14 @@ public class AssignmentManager extends Z
       Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers,
       List<String> nodes) throws IOException, KeeperException {
     if (null != deadServers) {
+      Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
       for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer :
         deadServers.entrySet()) {
+        // skip regions of dead servers because SSH will process regions during rs expiration.
+        // see HBASE-5916
+        if (actualDeadServers.contains(deadServer.getKey())) {
+          continue;
+        }
         List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
         for (Pair<HRegionInfo, Result> region : regions) {
           HRegionInfo regionInfo = region.getFirst();

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1343326&r1=1343325&r2=1343326&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon May 28 17:17:08 2012
@@ -516,11 +516,9 @@ Server {
     }
 
     this.assignmentManager.startTimeOutMonitor();
-    Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
-        .getOnlineServers().keySet());
     // TODO: Should do this in background rather than block master startup
     status.setStatus("Splitting logs after master startup");
-    splitLogAfterStartup(this.fileSystemManager, onlineServers);
+    splitLogAfterStartup(this.fileSystemManager);
 
     // Make sure root and meta assigned before proceeding.
     assignRootAndMeta(status);
@@ -536,7 +534,7 @@ Server {
 
     // Fixup assignment manager status
     status.setStatus("Starting assignment manager");
-    this.assignmentManager.joinCluster(onlineServers);
+    this.assignmentManager.joinCluster();
 
     this.balancer.setClusterStatus(getClusterStatus());
     this.balancer.setMasterServices(this);
@@ -557,6 +555,11 @@ Server {
     LOG.info("Master has completed initialization");
     initialized = true;
 
+    // clear the dead servers with same host name and port of online server because we are not
+    // removing dead server with same hostname and port of rs which is trying to check in before
+    // master initialization. See HBASE-5916.
+    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
+    
     if (this.cpHost != null) {
       // don't let cp initialization errors kill the master
       try {
@@ -580,9 +583,8 @@ Server {
    * @param mfs
    * @param onlineServers
    */
-  protected void splitLogAfterStartup(final MasterFileSystem mfs,
-      Set<ServerName> onlineServers) {
-    mfs.splitLogAfterStartup(onlineServers);
+  protected void splitLogAfterStartup(final MasterFileSystem mfs) {
+    mfs.splitLogAfterStartup();
   }
 
   /**

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1343326&r1=1343325&r2=1343326&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Mon May 28 17:17:08 2012
@@ -186,7 +186,7 @@ public class MasterFileSystem {
    * @param onlineServers Set of online servers keyed by
    * {@link ServerName}
    */
-  void splitLogAfterStartup(final Set<ServerName> onlineServers) {
+  void splitLogAfterStartup() {
     boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
         HLog.SPLIT_SKIP_ERRORS_DEFAULT);
     Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@@ -195,6 +195,10 @@ public class MasterFileSystem {
       try {
         if (!this.fs.exists(logsDirPath)) return;
         FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+        // Get online servers after getting log folders to avoid log folder deletion of newly
+        // checked in region servers . see HBASE-5916
+        Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
+            .keySet();
 
         if (logFolders == null || logFolders.length == 0) {
           LOG.debug("No log files to split, proceeding...");

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1343326&r1=1343325&r2=1343326&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Mon May 28 17:17:08 2012
@@ -191,7 +191,10 @@ public class ServerManager {
           existingServer + " looks stale, new server:" + serverName);
         expireServer(existingServer);
       }
-      throw new PleaseHoldException(message);
+      if (services.isServerShutdownHandlerEnabled()) {
+        // master has completed the initialization
+        throw new PleaseHoldException(message);
+      }
     }
   }
 
@@ -239,7 +242,10 @@ public class ServerManager {
       throw new YouAreDeadException(message);
     }
 
-    if (this.deadservers.cleanPreviousInstance(serverName)) {
+    // remove dead server with same hostname and port of newly checking in rs after master
+    // initialization.See HBASE-5916 for more information.
+    if ((this.services == null || ((HMaster) this.services).isInitialized())
+        && this.deadservers.cleanPreviousInstance(serverName)) {
       // This server has now become alive after we marked it as dead.
       // We removed it's previous entry from the dead list to reflect it.
       LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@@ -665,4 +671,18 @@ public class ServerManager {
       }
     }
   }
+    
+  /**
+   * To clear any dead server with same host name and port of any online server
+   */
+  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
+    ServerName sn = null;
+    for (ServerName serverName : getOnlineServersList()) {
+      while ((sn = ServerName.
+          findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
+        this.deadservers.remove(sn);
+      }
+    }
+  }
+
 }

Modified: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java?rev=1343326&r1=1343325&r2=1343326&view=diff
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (original)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java Mon May 28 17:17:08 2012
@@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.Te
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -95,9 +97,8 @@ public class TestRSKilledWhenMasterIniti
     }
 
     @Override
-    protected void splitLogAfterStartup(MasterFileSystem mfs,
-        Set<ServerName> onlineServers) {
-      super.splitLogAfterStartup(mfs, onlineServers);
+    protected void splitLogAfterStartup(MasterFileSystem mfs) {
+      super.splitLogAfterStartup(mfs);
       logSplit = true;
       // If "TestingMaster.sleep" is set, sleep after log split.
       if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@@ -212,6 +213,10 @@ public class TestRSKilledWhenMasterIniti
     while (serverManager.areDeadServersInProgress()) {
       Thread.sleep(100);
     }
+    // Create a ZKW to use in the test
+    ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
+    ZKAssign.blockUntilNoRIT(zkw);
+
     table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
     resultScanner = table.getScanner(new Scan());
     count = 0;



Mime
View raw message