hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From te...@apache.org
Subject hbase git commit: HBASE-12464 meta table region assignment stuck in the FAILED_OPEN state due to region server not fully ready to serve (Stephen Jiang)
Date Thu, 20 Nov 2014 21:51:29 GMT
Repository: hbase
Updated Branches:
  refs/heads/0.98 6b75bd5cf -> 0455f2e3f


HBASE-12464 meta table region assignment stuck in the FAILED_OPEN state due to region server
not fully ready to serve (Stephen Jiang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/0455f2e3
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/0455f2e3
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/0455f2e3

Branch: refs/heads/0.98
Commit: 0455f2e3fb1edc5bbc6c824223c793068fa63bd2
Parents: 6b75bd5
Author: tedyu <yuzhihong@gmail.com>
Authored: Thu Nov 20 13:51:17 2014 -0800
Committer: tedyu <yuzhihong@gmail.com>
Committed: Thu Nov 20 13:51:17 2014 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java  | 62 +++++++++++++++-----
 1 file changed, 48 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/0455f2e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index c9ad041..2a648b1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -172,7 +172,7 @@ public class AssignmentManager extends ZooKeeperListener {
 
   /**
    * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
-   * failure due to lack of availability of region plan
+   * failure due to lack of availability of region plan or bad region plan
    */
   private final long sleepTimeBeforeRetryingMetaAssignment;
 
@@ -2029,6 +2029,7 @@ public class AssignmentManager extends ZooKeeperListener {
             + ", the server is stopped/aborted");
           return;
         }
+
         if (plan == null) { // Get a server for the region at first
           try {
             plan = getRegionPlan(region, forceNewPlan);
@@ -2036,21 +2037,24 @@ public class AssignmentManager extends ZooKeeperListener {
             LOG.warn("Failed to get region plan", e);
           }
         }
+
         if (plan == null) {
           LOG.warn("Unable to determine a plan to assign " + region);
           if (tomActivated){
             this.timeoutMonitor.setAllRegionServersOffline(true);
           } else {
             if (region.isMetaRegion()) {
-              try {
-                Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
-                if (i == maximumAttempts) i = 1;
+                if (i == maximumAttempts) {
+                  i = 0; // re-set attempt count to 0 for at least 1 retry
+
+                  LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region +
+                    " after maximumAttempts (" + this.maximumAttempts +
+                    "). Reset attempts count and continue retrying.");
+                }
+                waitForRetryingMetaAssignment();
                 continue;
-              } catch (InterruptedException e) {
-                LOG.error("Got exception while waiting for hbase:meta assignment");
-                Thread.currentThread().interrupt();
-              }
             }
+
             regionStates.updateRegionState(region, State.FAILED_OPEN);
           }
           return;
@@ -2137,7 +2141,6 @@ public class AssignmentManager extends ZooKeeperListener {
           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
               && this.serverManager.isServerOnline(plan.getDestination()));
 
-
           if (hold) {
             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
               "try=" + i + " of " + this.maximumAttempts, t);
@@ -2186,9 +2189,19 @@ public class AssignmentManager extends ZooKeeperListener {
         }
 
         if (i == this.maximumAttempts) {
-          // Don't reset the region state or get a new plan any more.
-          // This is the last try.
-          continue;
+          // For meta region, we have to keep retrying until succeeding
+          if (region.isMetaRegion()) {
+            i = 0; // re-set attempt count to 0 for at least 1 retry
+            LOG.warn(assignMsg +
+                ", trying to assign a hbase:meta region reached to maximumAttempts (" +
+                this.maximumAttempts + ").  Reset attempt counts and continue retrying.");
+            waitForRetryingMetaAssignment();
+          }
+          else {
+            // Don't reset the region state or get a new plan any more.
+            // This is the last try.
+            continue;
+          }
         }
 
         // If region opened on destination of present plan, reassigning to new
@@ -2275,6 +2288,18 @@ public class AssignmentManager extends ZooKeeperListener {
   }
 
   /**
+   * Wait for some time before retrying meta table region assignment
+   */
+  private void waitForRetryingMetaAssignment() {
+    try {
+      Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
+    } catch (InterruptedException e) {
+      LOG.error("Got exception while waiting for hbase:meta assignment");
+      Thread.currentThread().interrupt();
+    }
+  }
+
+  /**
    * Set region as OFFLINED up in zookeeper
    *
    * @param state
@@ -3583,12 +3608,21 @@ public class AssignmentManager extends ZooKeeperListener {
       // name, and failedOpenTracker is updated only in this block
       failedOpenTracker.put(encodedName, failedOpenCount);
     }
-    if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
+    if (failedOpenCount.incrementAndGet() >= maximumAttempts && !hri.isMetaRegion() ) {
       regionStates.updateRegionState(hri, State.FAILED_OPEN);
       // remove the tracking info to save memory, also reset
       // the count for next open initiative
       failedOpenTracker.remove(encodedName);
-    } else {
+    }
+    else {
+      if (hri.isMetaRegion() && failedOpenCount.get() >= maximumAttempts) {
+        // Log a warning message if a meta region failedOpenCount exceeds maximumAttempts
+        // so that we are aware of potential problem if it persists for a long time.
+        LOG.warn("Failed to open the hbase:meta region " +
+            hri.getRegionNameAsString() + " after" +
+            failedOpenCount.get() + " retries. Continue retrying.");
+      }
+
       // Handle this the same as if it were opened and then closed.
       RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED);
       if (regionState != null) {


Mime
View raw message