hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From syuanji...@apache.org
Subject hbase git commit: HBASE-14536 Balancer & SSH interfering with each other leading to unavailability
Date Sat, 17 Oct 2015 05:39:00 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-1 e8c69a592 -> 9bdb88a57


HBASE-14536 Balancer & SSH interfering with each other leading to unavailability


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/9bdb88a5
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/9bdb88a5
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/9bdb88a5

Branch: refs/heads/branch-1
Commit: 9bdb88a572ac30fb51fcc44284f51543d2b4568f
Parents: e8c69a5
Author: Stephen Yuan Jiang <syuanjiangdev@gmail.com>
Authored: Fri Oct 16 22:38:28 2015 -0700
Committer: Stephen Yuan Jiang <syuanjiangdev@gmail.com>
Committed: Fri Oct 16 22:38:40 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java  | 46 +++++++++++++++-----
 .../hadoop/hbase/master/ServerManager.java      |  3 +-
 .../master/procedure/ServerCrashProcedure.java  | 34 ++++++++++++---
 .../procedure/TestServerCrashProcedure.java     |  4 +-
 4 files changed, 69 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 4fedbec..eef22c4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -259,6 +259,10 @@ public class AssignmentManager extends ZooKeeperListener {
 
   private RegionStateListener regionStateListener;
 
+  public enum ServerHostRegion {
+    NOT_HOSTING_REGION, HOSTING_REGION, UNKNOWN,
+  }
+
   /**
    * Constructs a new assignment manager.
    *
@@ -3371,16 +3375,16 @@ public class AssignmentManager extends ZooKeeperListener {
     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
   }
 
-  public boolean isCarryingMeta(ServerName serverName) {
+  public ServerHostRegion isCarryingMeta(ServerName serverName) {
     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
   }
 
-  public boolean isCarryingMetaReplica(ServerName serverName, int replicaId) {
+  public ServerHostRegion isCarryingMetaReplica(ServerName serverName, int replicaId) {
     return isCarryingRegion(serverName,
         RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, replicaId));
   }
 
-  public boolean isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) {
+  public ServerHostRegion isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) {
     return isCarryingRegion(serverName, metaHri);
   }
 
@@ -3394,7 +3398,7 @@ public class AssignmentManager extends ZooKeeperListener {
    * processing hasn't finished yet when server shutdown occurs.
    * @return whether the serverName currently hosts the region
    */
-  private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
+  private ServerHostRegion isCarryingRegion(ServerName serverName, HRegionInfo hri) {
     RegionTransition rt = null;
     try {
       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
@@ -3412,17 +3416,37 @@ public class AssignmentManager extends ZooKeeperListener {
       boolean matchZK = addressFromZK.equals(serverName);
       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
         " current=" + serverName + ", matches=" + matchZK);
-      return matchZK;
+      return matchZK ? ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
     }
 
     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
-    boolean matchAM = (addressFromAM != null &&
-      addressFromAM.equals(serverName));
-    LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
-      " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
-      " server being checked: " + serverName);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
+        " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
+        " server being checked: " + serverName);
+    }
+    if (addressFromAM != null) {
+      return addressFromAM.equals(serverName) ?
+          ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
+    }
+
+    if (hri.isMetaRegion() && RegionReplicaUtil.isDefaultReplica(hri)) {
+      // For the Meta region (default replica), we can do one more check on MetaTableLocator
+      final ServerName serverNameInZK =
+          server.getMetaTableLocator().getMetaRegionLocation(this.server.getZooKeeper());
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Based on MetaTableLocator, the META region is on server=" +
+          (serverNameInZK == null ? "null" : serverNameInZK) +
+          " server being checked: " + serverName);
+      }
+      if (serverNameInZK != null) {
+        return serverNameInZK.equals(serverName) ?
+            ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
+      }
+    }
 
-    return matchAM;
+    // Checked everywhere, if reaching here, we are unsure whether the server is carrying region.
+    return ServerHostRegion.UNKNOWN;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index a9b1f17..af6339c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -617,7 +617,8 @@ public class ServerManager {
       return;
     }
 
-    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
+    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName) ==
+        AssignmentManager.ServerHostRegion.HOSTING_REGION;
     this.services.getMasterProcedureExecutor().
       submitProcedure(new ServerCrashProcedure(serverName, true, carryingMeta));
     LOG.debug("Added=" + serverName +

http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index fcc95b1..6bb0262 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -313,8 +313,9 @@ implements ServerProcedureInterface {
   private boolean processMeta(final MasterProcedureEnv env)
   throws IOException {
     if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
-    MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
-    AssignmentManager am = env.getMasterServices().getAssignmentManager();
+    MasterServices services = env.getMasterServices();
+    MasterFileSystem mfs = services.getMasterFileSystem();
+    AssignmentManager am = services.getAssignmentManager();
     HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO;
     if (this.shouldSplitWal) {
       if (this.distributedLogReplay) {
@@ -328,9 +329,31 @@ implements ServerProcedureInterface {
 
     // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
     boolean processed = true;
-    if (am.isCarryingMeta(serverName)) {
+    boolean shouldAssignMeta = false;
+    AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName);
+      switch (rsCarryingMetaRegion) {
+        case HOSTING_REGION:
+          LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
+          am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
+          shouldAssignMeta = true;
+          break;
+        case UNKNOWN:
+          if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) {
+            // the meta location as per master is null. This could happen in case when meta
+            // assignment in previous run failed, while meta znode has been updated to null.
+            // We should try to assign the meta again.
+            shouldAssignMeta = true;
+            break;
+          }
+          // fall through
+        case NOT_HOSTING_REGION:
+          LOG.info("META has been assigned to otherwhere, skip assigning.");
+          break;
+        default:
+          throw new IOException("Unsupported action in MetaServerShutdownHandler");
+    }
+    if (shouldAssignMeta) {
       // TODO: May block here if hard time figuring state of meta.
-      am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
       verifyAndAssignMetaWithRetries(env);
       if (this.shouldSplitWal && distributedLogReplay) {
         int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
@@ -409,7 +432,8 @@ implements ServerProcedureInterface {
     for (int i = 1; i < replicaCount; i++) {
       HRegionInfo metaHri =
           RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i);
-      if (am.isCarryingMetaReplica(this.serverName, metaHri)) {
+      if (am.isCarryingMetaReplica(this.serverName, metaHri) ==
+          AssignmentManager.ServerHostRegion.HOSTING_REGION) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("Reassigning meta replica" + metaHri + " that was on " + this.serverName);
         }

http://git-wip-us.apache.org/repos/asf/hbase/blob/9bdb88a5/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
index 510b017..cafb0ed 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.AssignmentManager;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@@ -103,7 +104,8 @@ public class TestServerCrashProcedure {
       master.setServerCrashProcessingEnabled(false);
       // Kill a server. Master will notice but do nothing other than add it to list of dead servers.
       HRegionServer hrs = this.util.getHBaseCluster().getRegionServer(0);
-      boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(hrs.getServerName());
+      boolean carryingMeta = (master.getAssignmentManager().isCarryingMeta(hrs.getServerName()) ==
+          AssignmentManager.ServerHostRegion.HOSTING_REGION);
       this.util.getHBaseCluster().killRegionServer(hrs.getServerName());
       hrs.join();
       // Wait until the expiration of the server has arrived at the master. We won't process it


Mime
View raw message