hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From zhang...@apache.org
Subject [23/31] hbase git commit: HBASE-19815 Flakey TestAssignmentManager.testAssignWithRandExec (Part Two).
Date Fri, 19 Jan 2018 13:24:57 GMT
HBASE-19815 Flakey TestAssignmentManager.testAssignWithRandExec (Part Two).

Part One cleaned up a ClassCastException.

Part Two adds the ServerCrashProcedure#handleRIT behavior to RecoverMetaProcedure.

Adds debug in the test.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/581fabe7
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/581fabe7
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/581fabe7

Branch: refs/heads/HBASE-19064
Commit: 581fabe7b2177a090af33517f2f7cb1cdab2c64b
Parents: 646770d
Author: Michael Stack <stack@apache.org>
Authored: Wed Jan 17 22:35:35 2018 -0800
Committer: Michael Stack <stack@apache.org>
Committed: Thu Jan 18 11:32:21 2018 -0800

----------------------------------------------------------------------
 .../master/procedure/RecoverMetaProcedure.java  | 38 +++++++++++++++++---
 .../assignment/TestAssignmentManager.java       |  9 ++++-
 2 files changed, 41 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/581fabe7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
index 50ef3e0..70d0d55 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
@@ -28,6 +28,8 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
+import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
+import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
@@ -126,17 +128,17 @@ public class RecoverMetaProcedure
               RegionInfoBuilder.FIRST_META_REGIONINFO, this.replicaId);
 
           AssignProcedure metaAssignProcedure;
+          AssignmentManager am = master.getAssignmentManager();
           if (failedMetaServer != null) {
-            LOG.info(this + "; Assigning meta with new plan. previous meta server=" +
-                failedMetaServer);
-            metaAssignProcedure = master.getAssignmentManager().createAssignProcedure(hri);
+            handleRIT(env, hri, this.failedMetaServer);
+            LOG.info(this + "; Assigning meta with new plan; previous server=" + failedMetaServer);
+            metaAssignProcedure = am.createAssignProcedure(hri);
           } else {
             // get server carrying meta from zk
             ServerName metaServer =
                 MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
             LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
-            metaAssignProcedure =
-                master.getAssignmentManager().createAssignProcedure(hri, metaServer);
+            metaAssignProcedure = am.createAssignProcedure(hri, metaServer);
           }
 
           addChildProcedure(metaAssignProcedure);
@@ -152,6 +154,32 @@ public class RecoverMetaProcedure
     return Flow.HAS_MORE_STATE;
   }
 
+  /**
+   * Is the region stuck assigning to this failedMetaServer? If so, cancel the call
+   * just as we do over in ServerCrashProcedure#handleRIT except less to do here; less context
+   * to carry.
+   */
+  private void handleRIT(MasterProcedureEnv env, RegionInfo ri, ServerName crashedServerName) {
+    AssignmentManager am = env.getAssignmentManager();
+    RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(ri);
+    if (rtp == null) {
+      return; // Nothing to do. Not in RIT.
+    }
+    // Make sure the RIT is against this crashed server. In the case where there are many
+    // processings of a crashed server -- backed up for whatever reason (slow WAL split)
+    // -- then a previous SCP may have already failed an assign, etc., and it may have a
+    // new location target; DO NOT fail these else we make for assign flux.
+    ServerName rtpServerName = rtp.getServer(env);
+    if (rtpServerName == null) {
+      LOG.warn("RIT with ServerName null! " + rtp);
+    } else if (rtpServerName.equals(crashedServerName)) {
+      LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " +
+          rtp.getRegionState(env).toShortString());
+      rtp.remoteCallFailed(env, crashedServerName,
+          new ServerCrashException(getProcId(), crashedServerName));
+    }
+  }
+
   @Override
   protected void rollbackState(MasterProcedureEnv env,
       MasterProcedureProtos.RecoverMetaState recoverMetaState)

http://git-wip-us.apache.org/repos/asf/hbase/blob/581fabe7/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index 3ab915b..9b9f624 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.net.SocketTimeoutException;
 import java.util.NavigableMap;
 import java.util.Random;
@@ -206,7 +207,7 @@ public class TestAssignmentManager {
     rsDispatcher.setMockRsExecutor(new RandRsExecutor());
     // Loop a bunch of times so we hit various combos of exceptions.
     for (int i = 0; i < 10; i++) {
-      LOG.info("" + i);
+      LOG.info("ROUND=" + i);
       AssignProcedure proc = am.createAssignProcedure(hri);
       waitOnFuture(submitProcedure(proc));
     }
@@ -445,6 +446,12 @@ public class TestAssignmentManager {
       return future.get(5, TimeUnit.SECONDS);
     } catch (ExecutionException e) {
       LOG.info("ExecutionException", e);
+      Exception ee = (Exception)e.getCause();
+      if (ee instanceof InterruptedIOException) {
+        for (Procedure p: this.master.getMasterProcedureExecutor().getProcedures()) {
+          LOG.info(p.toStringDetails());
+        }
+      }
       throw (Exception)e.getCause();
     }
   }


Mime
View raw message