hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From st...@apache.org
Subject [1/3] hbase git commit: HBASE-18261 Created RecoverMetaProcedure and used it from ServerCrashProcedure and HMaster.finishActiveMasterInitialization().
Date Mon, 31 Jul 2017 21:25:46 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-2 5490c558b -> 7bdabed27


http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 1560f3b..83caf00 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -123,6 +123,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
 import org.apache.hadoop.hbase.master.procedure.ModifyColumnFamilyProcedure;
 import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
 import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
+import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
 import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
 import org.apache.hadoop.hbase.master.replication.ReplicationManager;
 import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
@@ -396,9 +397,6 @@ public class HMaster extends HRegionServer implements MasterServices {
   private long splitPlanCount;
   private long mergePlanCount;
 
-  /** flag used in test cases in order to simulate RS failures during master initialization
*/
-  private volatile boolean initializationBeforeMetaAssignment = false;
-
   /* Handle favored nodes information */
   private FavoredNodesManager favoredNodesManager;
 
@@ -794,14 +792,6 @@ public class HMaster extends HRegionServer implements MasterServices
{
     status.setStatus("Wait for region servers to report in");
     waitForRegionServers(status);
 
-    // get a list for previously failed RS which need log splitting work
-    // we recover hbase:meta region servers inside master initialization and
-    // handle other failed servers in SSH in order to start up master node ASAP
-    MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
-    metaBootstrap.splitMetaLogsBeforeAssignment();
-
-    this.initializationBeforeMetaAssignment = true;
-
     if (this.balancer instanceof FavoredNodesPromoter) {
       favoredNodesManager = new FavoredNodesManager(this);
     }
@@ -820,8 +810,12 @@ public class HMaster extends HRegionServer implements MasterServices
{
     if (isStopped()) return;
 
     // Make sure meta assigned before proceeding.
-    status.setStatus("Assigning Meta Region");
-    metaBootstrap.assignMeta();
+    status.setStatus("Recovering  Meta Region");
+
+    // we recover hbase:meta region servers inside master initialization and
+    // handle other failed servers in SSH in order to start up master node ASAP
+    MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
+    metaBootstrap.recoverMeta();
 
     // check if master is shutting down because above assignMeta could return even hbase:meta
isn't
     // assigned when master is shutting down
@@ -2710,14 +2704,6 @@ public class HMaster extends HRegionServer implements MasterServices
{
   }
 
   /**
-   * Report whether this master has started initialization and is about to do meta region
assignment
-   * @return true if master is in initialization & about to assign hbase:meta regions
-   */
-  public boolean isInitializationStartsMetaRegionAssignment() {
-    return this.initializationBeforeMetaAssignment;
-  }
-
-  /**
    * Compute the average load across all region servers.
    * Currently, this uses a very naive computation - just uses the number of
    * regions being served, ignoring stats about number of requests.
@@ -3424,6 +3410,17 @@ public class HMaster extends HRegionServer implements MasterServices
{
     return lockManager;
   }
 
+  @Override
+  public boolean recoverMeta() throws IOException {
+    ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch(2, 0);
+    long procId = procedureExecutor.submitProcedure(new RecoverMetaProcedure(null, true,
latch));
+    LOG.info("Waiting on RecoverMetaProcedure submitted with procId=" + procId);
+    latch.await();
+    LOG.info("Default replica of hbase:meta, location=" +
+        getMetaTableLocator().getMetaRegionLocation(getZooKeeper()));
+    return assignmentManager.isMetaInitialized();
+  }
+
   public QuotaObserverChore getQuotaObserverChore() {
     return this.quotaObserverChore;
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
index 049e659..7424dac 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
@@ -19,7 +19,6 @@
 package org.apache.hadoop.hbase.master;
 
 import java.io.IOException;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -49,45 +48,24 @@ public class MasterMetaBootstrap {
   private final MonitoredTask status;
   private final HMaster master;
 
-  private Set<ServerName> previouslyFailedServers;
-  private Set<ServerName> previouslyFailedMetaRSs;
-
   public MasterMetaBootstrap(final HMaster master, final MonitoredTask status) {
     this.master = master;
     this.status = status;
   }
 
-  public void splitMetaLogsBeforeAssignment() throws IOException, KeeperException {
+  public void recoverMeta() throws InterruptedException, IOException {
+    master.recoverMeta();
+    master.getTableStateManager().start();
+    enableCrashedServerProcessing(false);
+  }
+
+  public void processDeadServers() {
     // get a list for previously failed RS which need log splitting work
     // we recover hbase:meta region servers inside master initialization and
     // handle other failed servers in SSH in order to start up master node ASAP
-    previouslyFailedServers = master.getMasterWalManager().getFailedServersFromLogFolders();
-
-    // log splitting for hbase:meta server
-    ServerName oldMetaServerLocation = master.getMetaTableLocator()
-        .getMetaRegionLocation(master.getZooKeeper());
-    if (oldMetaServerLocation != null && previouslyFailedServers.contains(oldMetaServerLocation))
{
-      splitMetaLogBeforeAssignment(oldMetaServerLocation);
-      // Note: we can't remove oldMetaServerLocation from previousFailedServers list because
it
-      // may also host user regions
-    }
-    previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
-    // need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
-    // instead of previouslyFailedMetaRSs alone to address the following two situations:
-    // 1) the chained failure situation(recovery failed multiple times in a row).
-    // 2) master get killed right before it could delete the recovering hbase:meta from ZK
while the
-    // same server still has non-meta wals to be replayed so that
-    // removeStaleRecoveringRegionsFromZK can't delete the stale hbase:meta region
-    // Passing more servers into splitMetaLog is all right. If a server doesn't have hbase:meta
wal,
-    // there is no op for the server.
-    previouslyFailedMetaRSs.addAll(previouslyFailedServers);
-  }
-
-  public void assignMeta() throws InterruptedException, IOException, KeeperException {
-    assignMeta(previouslyFailedMetaRSs, HRegionInfo.DEFAULT_REPLICA_ID);
-  }
+    Set<ServerName> previouslyFailedServers =
+        master.getMasterWalManager().getFailedServersFromLogFolders();
 
-  public void processDeadServers() throws IOException {
     // Master has recovered hbase:meta region server and we put
     // other failed region servers in a queue to be handled later by SSH
     for (ServerName tmpServer : previouslyFailedServers) {
@@ -99,17 +77,12 @@ public class MasterMetaBootstrap {
       throws IOException, InterruptedException, KeeperException {
     int numReplicas = master.getConfiguration().getInt(HConstants.META_REPLICAS_NUM,
            HConstants.DEFAULT_META_REPLICA_NUM);
-    final Set<ServerName> EMPTY_SET = new HashSet<>();
     for (int i = 1; i < numReplicas; i++) {
-      assignMeta(EMPTY_SET, i);
+      assignMeta(i);
     }
     unassignExcessMetaReplica(numReplicas);
   }
 
-  private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException
{
-    master.getMasterWalManager().splitMetaLog(currentMetaServer);
-  }
-
   private void unassignExcessMetaReplica(int numMetaReplicasConfigured) {
     final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
     // unassign the unneeded replicas (for e.g., if the previous master was configured
@@ -137,12 +110,11 @@ public class MasterMetaBootstrap {
   /**
    * Check <code>hbase:meta</code> is assigned. If not, assign it.
    */
-  protected void assignMeta(Set<ServerName> previouslyFailedMetaRSs, int replicaId)
+  protected void assignMeta(int replicaId)
       throws InterruptedException, IOException, KeeperException {
     final AssignmentManager assignmentManager = master.getAssignmentManager();
 
     // Work on meta region
-    int assigned = 0;
     // TODO: Unimplemented
     // long timeout =
     //   master.getConfiguration().getLong("hbase.catalog.verification.timeout", 1000);
@@ -172,14 +144,14 @@ public class MasterMetaBootstrap {
     // if the meta region server is died at this time, we need it to be re-assigned
     // by SSH so that system tables can be assigned.
     // No need to wait for meta is assigned = 0 when meta is just verified.
-    if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(assigned
!= 0);
+    if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(false);
     LOG.info("hbase:meta with replicaId " + replicaId + ", location="
       + master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper(), replicaId));
     status.setStatus("META assigned.");
   }
 
   private void enableCrashedServerProcessing(final boolean waitForMeta)
-      throws IOException, InterruptedException {
+      throws InterruptedException {
     // If crashed server processing is disabled, we enable it and expire those dead but not
expired
     // servers. This is required so that if meta is assigning to a server which dies after
     // assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we
will be
@@ -193,23 +165,4 @@ public class MasterMetaBootstrap {
       master.getMetaTableLocator().waitMetaRegionLocation(master.getZooKeeper());
     }
   }
-
-  /**
-   * This function returns a set of region server names under hbase:meta recovering region
ZK node
-   * @return Set of meta server names which were recorded in ZK
-   */
-  private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException
{
-    final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
-    Set<ServerName> result = new HashSet<>();
-    String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.znodePaths.recoveringRegionsZNode,
-      HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
-    List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
-    if (regionFailedServers == null) return result;
-
-    for (String failedServer : regionFailedServers) {
-      ServerName server = ServerName.parseServerName(failedServer);
-      result.add(server);
-    }
-    return result;
-  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
index 2f8b0ee..3046b8a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
@@ -500,4 +500,11 @@ public interface MasterServices extends Server {
   public String getRegionServerVersion(final ServerName sn);
 
   public void checkIfShouldMoveSystemRegionAsync();
+
+  /**
+   * Recover meta table. Will result in no-op if meta is already initialized. Any code that has
+   * access to master and requires to access meta during process initialization can call this
+   * method to make sure meta is initialized.
+   */
+  boolean recoverMeta() throws IOException;
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
index c1e39fd..019ef65 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
@@ -151,7 +151,7 @@ public class MasterWalManager {
    * Inspect the log directory to find dead servers which need recovery work
    * @return A set of ServerNames which aren't running but still have WAL files left in file
system
    */
-  Set<ServerName> getFailedServersFromLogFolders() {
+  public Set<ServerName> getFailedServersFromLogFolders() {
     boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
         WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 7836625..255ea5e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1448,6 +1448,8 @@ public class AssignmentManager implements ServerListener {
     synchronized (regionNode) {
       State state = regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
       if (isMetaRegion(hri)) {
+        master.getTableStateManager().setTableState(TableName.META_TABLE_NAME,
+            TableState.State.ENABLED);
         setMetaInitialized(hri, true);
       }
       regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode);

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
new file mode 100644
index 0000000..72f0648
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.master.procedure;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
+import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverMetaState;
+import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
+import org.apache.zookeeper.KeeperException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Set;
+
+/**
+ * This procedure recovers meta from a prior shutdown/crash of a server, and brings meta online by
+ * assigning meta region/s. Any place where meta is accessed and required to be online needs to
+ * submit this procedure instead of duplicating steps to recover meta in the code.
+ */
+public class RecoverMetaProcedure
+    extends StateMachineProcedure<MasterProcedureEnv, MasterProcedureProtos.RecoverMetaState>
+    implements TableProcedureInterface {
+  private static final Log LOG = LogFactory.getLog(RecoverMetaProcedure.class);
+
+  private ServerName failedMetaServer;
+  private boolean shouldSplitWal;
+  private int replicaId;
+
+  private final ProcedurePrepareLatch syncLatch;
+  private HMaster master;
+
+  /**
+   * Call this constructor to queue up a {@link RecoverMetaProcedure} in response to meta
+   * carrying server crash
+   * @param failedMetaServer failed/ crashed region server that was carrying meta
+   * @param shouldSplitLog split log file of meta region
+   */
+  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog)
{
+    this(failedMetaServer, shouldSplitLog, null);
+  }
+
+  /**
+   * Constructor with latch, for blocking/ sync usage
+   */
+  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog,
+                              final ProcedurePrepareLatch latch) {
+    this.failedMetaServer = failedMetaServer;
+    this.shouldSplitWal = shouldSplitLog;
+    this.replicaId = HRegionInfo.DEFAULT_REPLICA_ID;
+    this.syncLatch = latch;
+  }
+
+  /**
+   * This constructor is also used when deserializing from a procedure store; we'll construct
one
+   * of these then call {@link #deserializeStateData(InputStream)}. Do not use directly.
+   */
+  public RecoverMetaProcedure() {
+    this(null, false);
+  }
+
+  @Override
+  protected Flow executeFromState(MasterProcedureEnv env,
+      MasterProcedureProtos.RecoverMetaState state)
+      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
+    prepare(env);
+
+    if (!isRunRequired()) {
+      LOG.info(this + "; Meta already initialized. Skipping run");
+      return Flow.NO_MORE_STATE;
+    }
+
+    try {
+      switch (state) {
+        case RECOVER_META_SPLIT_LOGS:
+          LOG.info("Start " + this);
+          if (shouldSplitWal) {
+            // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
+            if (failedMetaServer != null) {
+              master.getMasterWalManager().splitMetaLog(failedMetaServer);
+            } else {
+              ServerName serverName =
+                  master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper());
+              Set<ServerName> previouslyFailedServers =
+                  master.getMasterWalManager().getFailedServersFromLogFolders();
+              if (serverName != null && previouslyFailedServers.contains(serverName))
{
+                master.getMasterWalManager().splitMetaLog(serverName);
+              }
+            }
+          }
+          setNextState(RecoverMetaState.RECOVER_META_ASSIGN_REGIONS);
+          break;
+
+        case RECOVER_META_ASSIGN_REGIONS:
+          HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
+              HRegionInfo.FIRST_META_REGIONINFO, this.replicaId);
+
+          AssignProcedure metaAssignProcedure;
+          if (failedMetaServer != null) {
+            LOG.info(this + "; Assigning meta with new plan. previous meta server=" +
+                failedMetaServer);
+            metaAssignProcedure = master.getAssignmentManager().createAssignProcedure(hri,
true);
+          } else {
+            // get server carrying meta from zk
+            ServerName metaServer =
+                MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
+            LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
+            metaAssignProcedure =
+                master.getAssignmentManager().createAssignProcedure(hri, metaServer);
+          }
+
+          addChildProcedure(metaAssignProcedure);
+          return Flow.NO_MORE_STATE;
+
+        default:
+          throw new UnsupportedOperationException("unhandled state=" + state);
+      }
+    } catch (IOException|KeeperException e) {
+      LOG.warn(this + "; Failed state=" + state + ", retry " + this + "; cycles=" +
+          getCycles(), e);
+    }
+    return Flow.HAS_MORE_STATE;
+  }
+
+  @Override
+  protected void rollbackState(MasterProcedureEnv env,
+      MasterProcedureProtos.RecoverMetaState recoverMetaState)
+      throws IOException, InterruptedException {
+    // Can't rollback
+    throw new UnsupportedOperationException("unhandled state=" + recoverMetaState);
+  }
+
+  @Override
+  protected MasterProcedureProtos.RecoverMetaState getState(int stateId) {
+    return RecoverMetaState.forNumber(stateId);
+  }
+
+  @Override
+  protected int getStateId(MasterProcedureProtos.RecoverMetaState recoverMetaState) {
+    return recoverMetaState.getNumber();
+  }
+
+  @Override
+  protected MasterProcedureProtos.RecoverMetaState getInitialState() {
+    return RecoverMetaState.RECOVER_META_SPLIT_LOGS;
+  }
+
+  @Override
+  protected void toStringClassDetails(StringBuilder sb) {
+    sb.append(getClass().getSimpleName());
+    sb.append(" failedMetaServer=");
+    sb.append(failedMetaServer);
+    sb.append(", splitWal=");
+    sb.append(shouldSplitWal);
+  }
+
+  @Override
+  protected void serializeStateData(OutputStream stream) throws IOException {
+    super.serializeStateData(stream);
+    MasterProcedureProtos.RecoverMetaStateData.Builder state =
+        MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
+    if (failedMetaServer != null) {
+      state.setFailedMetaServer(ProtobufUtil.toServerName(failedMetaServer));
+    }
+    state.setReplicaId(replicaId);
+    state.build().writeDelimitedTo(stream);
+  }
+
+  @Override
+  protected void deserializeStateData(InputStream stream) throws IOException {
+    super.deserializeStateData(stream);
+    MasterProcedureProtos.RecoverMetaStateData state =
+        MasterProcedureProtos.RecoverMetaStateData.parseDelimitedFrom(stream);
+    this.shouldSplitWal = state.hasShouldSplitWal() && state.getShouldSplitWal();
+    this.failedMetaServer = state.hasFailedMetaServer() ?
+        ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
+    this.replicaId = state.hasReplicaId() ? state.getReplicaId() : HRegionInfo.DEFAULT_REPLICA_ID;
+  }
+
+  @Override
+  protected LockState acquireLock(MasterProcedureEnv env) {
+    if (env.getProcedureScheduler().waitTableExclusiveLock(this, TableName.META_TABLE_NAME))
{
+      return LockState.LOCK_EVENT_WAIT;
+    }
+    return LockState.LOCK_ACQUIRED;
+  }
+
+  @Override
+  protected void releaseLock(MasterProcedureEnv env) {
+    env.getProcedureScheduler().wakeTableExclusiveLock(this, TableName.META_TABLE_NAME);
+  }
+
+  @Override
+  protected void completionCleanup(MasterProcedureEnv env) {
+    ProcedurePrepareLatch.releaseLatch(syncLatch, this);
+  }
+
+  @Override
+  public TableName getTableName() {
+    return TableName.META_TABLE_NAME;
+  }
+
+  @Override
+  public TableOperationType getTableOperationType() {
+    return TableOperationType.ENABLE;
+  }
+
+  /**
+   * @return true if failedMetaServer is not null (meta carrying server crashed) or meta is
+   * not yet initialized
+   */
+  private boolean isRunRequired() {
+    return failedMetaServer != null || !master.getAssignmentManager().isMetaInitialized();
+  }
+
+  /**
+   * Prepare for execution
+   */
+  private void prepare(MasterProcedureEnv env) {
+    if (master == null) {
+      master = (HMaster) env.getMasterServices();
+      Preconditions.checkArgument(master != null);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 4fb8c07..4f3e5ce 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 
@@ -79,17 +78,6 @@ implements ServerProcedureInterface {
   private boolean shouldSplitWal;
 
   /**
-   * Cycles on same state. Good for figuring if we are stuck.
-   */
-  private int cycles = 0;
-
-  /**
-   * Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if
useful,
-   * move this back up into StateMachineProcedure
-   */
-  private int previousState;
-
-  /**
    * Call this constructor queuing up a Procedure.
    * @param serverName Name of the crashed server.
    * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
@@ -117,16 +105,6 @@ implements ServerProcedureInterface {
   @Override
   protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
       throws ProcedureSuspendedException, ProcedureYieldException {
-    if (LOG.isTraceEnabled()) {
-      LOG.trace(state  + " " + this + "; cycles=" + this.cycles);
-    }
-    // Keep running count of cycles
-    if (state.ordinal() != this.previousState) {
-      this.previousState = state.ordinal();
-      this.cycles = 0;
-    } else {
-      this.cycles++;
-    }
     final MasterServices services = env.getMasterServices();
     // HBASE-14802
     // If we have not yet notified that we are processing a dead server, we should do now.
@@ -182,7 +160,7 @@ implements ServerProcedureInterface {
           if (LOG.isTraceEnabled()) {
             LOG.trace("Assigning regions " +
               HRegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this +
-              "; cycles=" + this.cycles);
+              "; cycles=" + getCycles());
           }
           handleRIT(env, regionsOnCrashedServer);
           AssignmentManager am = env.getAssignmentManager();
@@ -200,7 +178,7 @@ implements ServerProcedureInterface {
         throw new UnsupportedOperationException("unhandled state=" + state);
       }
     } catch (IOException e) {
-      LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + this.cycles, e);
+      LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
     }
     return Flow.HAS_MORE_STATE;
   }
@@ -208,15 +186,10 @@ implements ServerProcedureInterface {
   /**
    * @param env
    * @throws IOException
-   * @throws InterruptedException
    */
   private void processMeta(final MasterProcedureEnv env) throws IOException {
-    if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
-
-    if (this.shouldSplitWal) {
-      // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
-      env.getMasterServices().getMasterWalManager().splitMetaLog(serverName);
-    }
+    if (LOG.isDebugEnabled()) LOG.debug(this + "; Processing hbase:meta that was on " +
+        this.serverName);
 
     // Assign meta if still carrying it. Check again: region may be assigned because of RIT
timeout
     final AssignmentManager am = env.getMasterServices().getAssignmentManager();
@@ -224,19 +197,13 @@ implements ServerProcedureInterface {
       if (!isDefaultMetaRegion(hri)) continue;
 
       am.offlineRegion(hri);
-      addChildProcedure(am.createAssignProcedure(hri, true));
+      addChildProcedure(new RecoverMetaProcedure(serverName, this.shouldSplitWal));
     }
   }
 
   private boolean filterDefaultMetaRegions(final List<HRegionInfo> regions) {
     if (regions == null) return false;
-    final Iterator<HRegionInfo> it = regions.iterator();
-    while (it.hasNext()) {
-      final HRegionInfo hri = it.next();
-      if (isDefaultMetaRegion(hri)) {
-        it.remove();
-      }
-    }
+    regions.removeIf(this::isDefaultMetaRegion);
     return !regions.isEmpty();
   }
 
@@ -260,10 +227,6 @@ implements ServerProcedureInterface {
     am.getRegionStates().logSplit(this.serverName);
   }
 
-  static int size(final Collection<HRegionInfo> hris) {
-    return hris == null? 0: hris.size();
-  }
-
   @Override
   protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
   throws IOException {
@@ -273,7 +236,7 @@ implements ServerProcedureInterface {
 
   @Override
   protected ServerCrashState getState(int stateId) {
-    return ServerCrashState.valueOf(stateId);
+    return ServerCrashState.forNumber(stateId);
   }
 
   @Override
@@ -394,9 +357,8 @@ implements ServerProcedureInterface {
    * Notify them of crash. Remove assign entries from the passed in <code>regions</code>
    * otherwise we have two assigns going on and they will fight over who has lock.
    * Notify Unassigns also.
-   * @param crashedServer Server that crashed.
+   * @param env
    * @param regions Regions that were on crashed server
-   * @return Subset of <code>regions</code> that were RIT against <code>crashedServer</code>
    */
   private void handleRIT(final MasterProcedureEnv env, final List<HRegionInfo> regions)
{
     if (regions == null) return;

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
index 2d30d7e..3c4dc94 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
@@ -52,6 +52,8 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import com.google.protobuf.Service;
 
+import static org.mockito.Mockito.mock;
+
 public class MockNoopMasterServices implements MasterServices, Server {
   private final Configuration conf;
   private final MetricsMaster metricsMaster;
@@ -324,7 +326,7 @@ public class MockNoopMasterServices implements MasterServices, Server {
 
   @Override
   public TableStateManager getTableStateManager() {
-    return null;
+    return mock(TableStateManager.class);
   }
 
   @Override
@@ -452,6 +454,11 @@ public class MockNoopMasterServices implements MasterServices, Server {
   }
 
   @Override
+  public boolean recoverMeta() throws IOException {
+    return false;
+  }
+
+  @Override
   public ProcedureEvent getInitializedEvent() {
     // TODO Auto-generated method stub
     return null;

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
index fe0e7b1..4c06f07 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
@@ -272,7 +272,7 @@ public class TestMasterNoCluster {
       MasterMetaBootstrap createMetaBootstrap(final HMaster master, final MonitoredTask status) {
         return new MasterMetaBootstrap(this, status) {
           @Override
-          protected void assignMeta(Set<ServerName> previouslyFailedMeatRSs, int replicaId) { }
+          protected void assignMeta(int replicaId) { }
         };
       }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
index d8a69a6..6dfcad1 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
@@ -92,13 +92,10 @@ public class MasterProcedureTestingUtility {
         public Void call() throws Exception {
           final AssignmentManager am = env.getAssignmentManager();
           am.start();
-          if (true) {
-            MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
-                TaskMonitor.get().createStatus("meta"));
-            metaBootstrap.splitMetaLogsBeforeAssignment();
-            metaBootstrap.assignMeta();
-            metaBootstrap.processDeadServers();
-          }
+          MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
+              TaskMonitor.get().createStatus("meta"));
+          metaBootstrap.recoverMeta();
+          metaBootstrap.processDeadServers();
           am.joinCluster();
           master.setInitialized(true);
           return null;

http://git-wip-us.apache.org/repos/asf/hbase/blob/7bdabed2/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
index b6bf0bb..0a31a84 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
@@ -84,17 +84,18 @@ public class TestServerCrashProcedure {
 
   @Test(timeout=60000)
   public void testCrashTargetRs() throws Exception {
+    testRecoveryAndDoubleExecution(false, false);
   }
 
   @Ignore  // HBASE-18366... To be enabled again.
   @Test(timeout=60000)
   public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
-    testRecoveryAndDoubleExecution(true);
+    testRecoveryAndDoubleExecution(true, true);
   }
 
   @Test(timeout=60000)
   public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
-    testRecoveryAndDoubleExecution(false);
+    testRecoveryAndDoubleExecution(false, true);
   }
 
   /**
@@ -102,7 +103,8 @@ public class TestServerCrashProcedure {
    * needed state.
    * @throws Exception
    */
-  private void testRecoveryAndDoubleExecution(final boolean carryingMeta) throws Exception {
+  private void testRecoveryAndDoubleExecution(final boolean carryingMeta,
+                                              final boolean doubleExecution) throws Exception {
     final TableName tableName = TableName.valueOf(
       "testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta);
     final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
@@ -120,7 +122,7 @@ public class TestServerCrashProcedure {
       master.setServerCrashProcessingEnabled(false);
       // find the first server that match the request and executes the test
       ServerName rsToKill = null;
-      for (HRegionInfo hri: util.getHBaseAdmin().getTableRegions(tableName)) {
+      for (HRegionInfo hri : util.getHBaseAdmin().getTableRegions(tableName)) {
         final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
         if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
           rsToKill = serverName;
@@ -135,14 +137,22 @@ public class TestServerCrashProcedure {
       master.getServerManager().moveFromOnlineToDeadServers(rsToKill);
       // Enable test flags and then queue the crash procedure.
       ProcedureTestingUtility.waitNoProcedureRunning(procExec);
-      ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
-      long procId = procExec.submitProcedure(new ServerCrashProcedure(
-          procExec.getEnvironment(), rsToKill, true, carryingMeta));
-      // Now run through the procedure twice crashing the executor on each step...
-      MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
+      ServerCrashProcedure scp = new ServerCrashProcedure(procExec.getEnvironment(), rsToKill,
+          true, carryingMeta);
+      if (doubleExecution) {
+        ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
+        long procId = procExec.submitProcedure(scp);
+        // Now run through the procedure twice crashing the executor on each step...
+        MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
+      } else {
+        ProcedureTestingUtility.submitAndWait(procExec, scp);
+      }
       // Assert all data came back.
       assertEquals(count, util.countRows(t));
       assertEquals(checksum, util.checksumRows(t));
+    } catch(Throwable throwable) {
+      LOG.error("Test failed!", throwable);
+      throw throwable;
     } finally {
       t.close();
     }


Mime
View raw message