hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apurt...@apache.org
Subject hbase git commit: HBASE-20087 Periodically attempt redeploy of regions in FAILED_OPEN state
Date Wed, 28 Feb 2018 02:14:53 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-1 9a7a36750 -> 1be9a3d64


HBASE-20087 Periodically attempt redeploy of regions in FAILED_OPEN state

Signed-off-by: Josh Elser <elserj@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/1be9a3d6
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/1be9a3d6
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/1be9a3d6

Branch: refs/heads/branch-1
Commit: 1be9a3d641569be7fbe3267cd0683397eb046366
Parents: 9a7a367
Author: Andrew Purtell <apurtell@apache.org>
Authored: Mon Feb 26 11:44:39 2018 -0800
Committer: Andrew Purtell <apurtell@apache.org>
Committed: Tue Feb 27 17:11:44 2018 -0800

----------------------------------------------------------------------
 .../hbase/rsgroup/RSGroupInfoManagerImpl.java   | 73 --------------------
 .../hadoop/hbase/master/AssignmentManager.java  | 28 ++++++++
 .../hadoop/hbase/master/RegionStates.java       |  7 ++
 3 files changed, 35 insertions(+), 73 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/1be9a3d6/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java
----------------------------------------------------------------------
diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java
index caa7fc6..be15fa5 100644
--- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java
+++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java
@@ -66,7 +66,6 @@ import org.apache.hadoop.hbase.constraint.ConstraintException;
 import org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint;
 import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel;
 import org.apache.hadoop.hbase.master.MasterServices;
-import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.procedure.CreateTableProcedure;
 import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
@@ -121,7 +120,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene
   private volatile Set<String> prevRSGroups;
   private RSGroupSerDe rsGroupSerDe;
   private DefaultServerUpdater defaultServerUpdater;
-  private FailedOpenUpdater failedOpenUpdater;
   private boolean isInit = false;
 
   public RSGroupInfoManagerImpl(MasterServices master) throws IOException {
@@ -139,8 +137,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene
     refresh();
     defaultServerUpdater = new DefaultServerUpdater(this);
     Threads.setDaemonThreadRunning(defaultServerUpdater);
-    failedOpenUpdater = new FailedOpenUpdater(this);
-    Threads.setDaemonThreadRunning(failedOpenUpdater);
     master.getServerManager().registerListener(this);
     isInit = true;
   }
@@ -493,7 +489,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene
   @Override
   public void serverAdded(ServerName serverName) {
     defaultServerUpdater.serverChanged();
-    failedOpenUpdater.serverChanged();
   }
 
   @Override
@@ -561,74 +556,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene
     }
   }
 
-  private static class FailedOpenUpdater extends Thread {
-    private static final Log LOG = LogFactory.getLog(FailedOpenUpdater.class);
-
-    private final RSGroupInfoManagerImpl mgr;
-    private final long waitInterval;
-    private volatile boolean hasChanged = false;
-
-    public FailedOpenUpdater(RSGroupInfoManagerImpl mgr) {
-      this.mgr = mgr;
-      this.waitInterval = mgr.master.getConfiguration().getLong(REASSIGN_WAIT_INTERVAL_KEY,
-        DEFAULT_REASSIGN_WAIT_INTERVAL);
-      setName(FailedOpenUpdater.class.getName()+"-" + mgr.master.getServerName());
-      setDaemon(true);
-    }
-
-    @Override
-    public void run() {
-      while (!mgr.master.isAborted() && !mgr.master.isStopped()) {
-        boolean interrupted = false;
-        try {
-          synchronized (this) {
-            while (!hasChanged) {
-              wait();
-            }
-            hasChanged = false;
-          }
-        } catch (InterruptedException e) {
-          LOG.warn("Interrupted", e);
-          interrupted = true;
-        }
-        if (mgr.master.isAborted() || mgr.master.isStopped() || interrupted) {
-          continue;
-        }
-
-        // First, wait a while in case more servers are about to rejoin the cluster
-        try {
-          Thread.sleep(waitInterval);
-        } catch (InterruptedException e) {
-          LOG.warn("Interrupted", e);
-        }
-        if (mgr.master.isAborted() || mgr.master.isStopped()) {
-          continue;
-        }
-
-        // Kick all regions in FAILED_OPEN state
-        List<HRegionInfo> failedAssignments = Lists.newArrayList();
-        for (RegionState state:
-            mgr.master.getAssignmentManager().getRegionStates().getRegionsInTransition()) {
-          if (state.isFailedOpen()) {
-            failedAssignments.add(state.getRegion());
-          }
-        }
-        for (HRegionInfo region: failedAssignments) {
-          LOG.info("Retrying assignment of " + region);
-          mgr.master.getAssignmentManager().unassign(region);
-        }
-      }
-    }
-
-    // Only called for server additions
-    public void serverChanged() {
-      synchronized (this) {
-        hasChanged = true;
-        this.notify();
-      }
-    }
-  }
-
   @Override
   public void waiting() {
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/1be9a3d6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 6dec3ce..d9345d8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -140,6 +140,9 @@ public class AssignmentManager extends ZooKeeperListener {
     = "hbase.assignment.already.intransition.waittime";
   static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
 
+  static final String FAILED_OPEN_RETRY_KEY = "hbase.assignment.failed.open.retry.period";
+  static final int FAILED_OPEN_RETRY_DEFAULT = 300000; // 5 minutes
+
   protected final MasterServices server;
 
   private ServerManager serverManager;
@@ -352,6 +355,12 @@ public class AssignmentManager extends ZooKeeperListener {
     this.retryConfig.setMaxSleepTime(conf.getLong("hbase.assignment.retry.sleep.max",
         retryConfig.getSleepInterval()));
     this.backoffPolicy = getBackoffPolicy();
+
+    int failedOpenRetryPeriod = conf.getInt(FAILED_OPEN_RETRY_KEY, FAILED_OPEN_RETRY_DEFAULT);
+    if (failedOpenRetryPeriod > 0) {
+      scheduledThreadPoolExecutor.scheduleWithFixedDelay(new FailedOpenRetryRunnable(),
+        failedOpenRetryPeriod, failedOpenRetryPeriod, TimeUnit.MILLISECONDS);
+    }
   }
 
   /**
@@ -4757,4 +4766,23 @@ public class AssignmentManager extends ZooKeeperListener {
   public static void setTestSkipMergeHandling(boolean skipMergeHandling) {
     TEST_SKIP_MERGE_HANDLING = skipMergeHandling;
   }
+
+  /**
+   * Scheduled task that will attempt to redeploy regions that have transitioned permanently into
+   * FAILED_OPEN state.
+   */
+  class FailedOpenRetryRunnable implements Runnable {
+    @Override
+    public void run() {
+      // Kick regions that have been transitioned into permanent FAILED_OPEN state
+      for (RegionState s: getRegionStates().getAllRegions()) {
+        if (s.isFailedOpen()) {
+          LOG.info("Retrying failed assignment for " + s.toDescriptiveString());
+          // Run the entire unassign protocol for safety's sake
+          unassign(s.getRegion());
+        }
+      }
+    }
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/1be9a3d6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index f47d555..4ce1db3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -232,6 +232,13 @@ public class RegionStates {
   }
 
   /**
+   * Get all regions and their states
+   */
+  public synchronized Set<RegionState> getAllRegions() {
+    return new HashSet<RegionState>(regionStates.values());
+  }
+
+  /**
    * @return a set of the regions in transition that are sorted by timestamp
    */
   public synchronized SortedSet<RegionState> getRegionsInTransitionOrderedByTimestamp() {


Mime
View raw message