From commits-return-78796-archive-asf-public=cust-asf.ponee.io@hbase.apache.org Fri Oct 5 02:19:32 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id F310D180658 for ; Fri, 5 Oct 2018 02:19:31 +0200 (CEST) Received: (qmail 76358 invoked by uid 500); 5 Oct 2018 00:19:31 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 76349 invoked by uid 99); 5 Oct 2018 00:19:31 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 05 Oct 2018 00:19:31 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 9C57EDFE18; Fri, 5 Oct 2018 00:19:30 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: stack@apache.org To: commits@hbase.apache.org Message-Id: <9f21181933e8454ca030f943b556da6c@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements Date: Fri, 5 Oct 2018 00:19:30 +0000 (UTC) Repository: hbase Updated Branches: refs/heads/branch-2.1 8fc90a23a -> 9d34b4581 HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements For RIT Duration, do better than print ms/seconds. Remove redundant UI column dedicated to duration when we log it in the status field too. Make bypass log at INFO level. Make it so on complete of subprocedure, we note count of outstanding siblings so we have a clue how much further the parent has to go before it is done (Helpful when hundreds of servers doing SCP). Have the SCP run the AP preflight check before creating an AP; saves creation of thousands of APs during fixup. Don't log tablename three times when reporting remote call failed. If lock is held already, note who has it. Also log after we get lock or if we have to wait rather than log on entrance though we may later have to wait (or we may have just picked up the lock). Signed-off-by: Mike Drob Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/9d34b458 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/9d34b458 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/9d34b458 Branch: refs/heads/branch-2.1 Commit: 9d34b4581c485deb3a659c7a435d557494e61462 Parents: 8fc90a2 Author: Michael Stack Authored: Wed Sep 26 21:22:46 2018 -0700 Committer: Michael Stack Committed: Thu Oct 4 17:18:13 2018 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hbase/master/RegionState.java | 6 +-- .../hadoop/hbase/procedure2/Procedure.java | 3 ++ .../hbase/procedure2/ProcedureExecutor.java | 30 +++++++++------ .../master/AssignmentManagerStatusTmpl.jamon | 3 +- .../master/assignment/AssignProcedure.java | 40 ++++++++++++++++---- .../assignment/RegionTransitionProcedure.java | 3 +- .../master/assignment/UnassignProcedure.java | 2 +- .../procedure/MasterProcedureScheduler.java | 5 ++- .../master/procedure/ServerCrashProcedure.java | 8 ++++ 9 files changed, 73 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java index 7289ce8..a1e2ca6 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java @@ -371,9 +371,9 @@ public class RegionState { public String toDescriptiveString() { long relTime = System.currentTimeMillis() - stamp; return hri.getRegionNameAsString() - + " state=" + state - + ", ts=" + new Date(stamp) + " (" + (relTime/1000) + "s ago)" - + ", server=" + serverName; + + " state=" + state + ", ts=" + new Date(stamp) + " (" + + java.time.Duration.ofMillis(relTime).toString() + + " ago), server=" + serverName; } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 2b66961..01dc1be 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -883,6 +883,9 @@ public abstract class Procedure implements Comparable 0; } + /** + * @return Count of children outstanding (Badly named). + */ protected synchronized int getChildrenLatch() { return childrenLatch; } http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 58e0610..cb191aa 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1015,21 +1015,21 @@ public class ProcedureExecutor { return false; } - LOG.debug("Begin bypass {} with lockWait={}, override={}, recursive={}", + LOG.info("Begin bypass {} with lockWait={}, override={}, recursive={}", procedure, lockWait, override, recursive); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); if (lockEntry == null && !override) { - LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, skipping bypass with override={}", lockWait, procedure, override); return false; } else if (lockEntry == null) { - LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, begin bypass with override={}", lockWait, procedure, override); } try { // check whether the procedure is already finished if (procedure.isFinished()) { - LOG.debug("{} is already finished, skipping bypass", procedure); + LOG.info("{} is already finished, skipping bypass", procedure); return false; } @@ -1054,7 +1054,7 @@ public class ProcedureExecutor { } }); } else { - LOG.debug("{} has children, skipping bypass", procedure); + LOG.info("{} has children, skipping bypass", procedure); return false; } } @@ -1063,7 +1063,7 @@ public class ProcedureExecutor { if (procedure.hasParent() && procedure.getState() != ProcedureState.RUNNABLE && procedure.getState() != ProcedureState.WAITING && procedure.getState() != ProcedureState.WAITING_TIMEOUT) { - LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + LOG.info("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + "(with no parent), {}", procedure); // Question: how is the bypass done here? @@ -1075,7 +1075,7 @@ public class ProcedureExecutor { // finished yet Procedure current = procedure; while (current != null) { - LOG.debug("Bypassing {}", current); + LOG.info("Bypassing {}", current); current.bypass(getEnvironment()); store.update(procedure); long parentID = current.getParentProcId(); @@ -1095,9 +1095,9 @@ public class ProcedureExecutor { if (lockEntry != null) { // add the procedure to run queue, scheduler.addFront(procedure); - LOG.debug("Bypassing {} and its ancestors successfully, adding to queue", procedure); + LOG.info("Bypassing {} and its ancestors successfully, adding to queue", procedure); } else { - LOG.debug("Bypassing {} and its ancestors successfully, but since it is already running, " + LOG.info("Bypassing {} and its ancestors successfully, but since it is already running, " + "skipping add to queue", procedure); } return true; @@ -1507,9 +1507,17 @@ public class ProcedureExecutor { procStack.release(proc); if (proc.isSuccess()) { - // update metrics on finishing the procedure + // update metrics on finishing the procedure. proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); - LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); + // Print out count of outstanding siblings if this procedure has a parent. + Procedure parent = null; + if (proc.hasParent()) { + parent = procedures.get(proc.getParentProcId()); + } + LOG.info("Finished {} in {}{}", + proc, + StringUtils.humanTimeDiff(proc.elapsedTime()), + parent != null? (", unfinishedSiblingCount=" + parent.getChildrenLatch()): ""); // Finalize the procedure state if (proc.getProcId() == rootProcId) { procedureFinished(proc); http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index bb3686b..51dd438 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -73,7 +73,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
- + <%if ritStat.isRegionTwiceOverThreshold(rs.getRegion()) %> @@ -96,7 +96,6 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); - <%java recordItr++; %> http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index ac4a67f..a0297b3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -26,7 +26,9 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RetriesExhaustedException; +import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; @@ -164,19 +166,43 @@ public class AssignProcedure extends RegionTransitionProcedure { } } - @Override - protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode) + /** + * Used by ServerCrashProcedure too skip creating Assigns if not needed. + * @return Skip out on the assign; returns 'true'/assign if exception. + */ + public static boolean assign(MasterServices masterServices, RegionInfo ri) { + try { + return assign(masterServices, + masterServices.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(ri)); + } catch (IOException e) { + LOG.warn("Letting assign proceed", e); + } + return true; + } + + protected static boolean assign(MasterServices masterServices, final RegionStateNode regionNode) throws IOException { // If the region is already open we can't do much... - if (regionNode.isInState(State.OPEN) && isServerOnline(env, regionNode)) { - LOG.info("Assigned, not reassigning; " + this + "; " + regionNode.toShortString()); + if (regionNode.isInState(State.OPEN) && + masterServices.getServerManager().isServerOnline(regionNode.getRegionLocation())) { + LOG.info("Assigned, not reassigning {}", regionNode.toShortString()); return false; } // Don't assign if table is in disabling or disabled state. - TableStateManager tsm = env.getMasterServices().getTableStateManager(); + TableStateManager tsm = masterServices.getTableStateManager(); TableName tn = regionNode.getRegionInfo().getTable(); - if (tsm.getTableState(tn).isDisabledOrDisabling()) { - LOG.info("Table " + tn + " state=" + tsm.getTableState(tn) + ", skipping " + this); + TableState ts = tsm.getTableState(tn); + if (ts.isDisabledOrDisabling()) { + LOG.info("{} so SKIPPING assign of {}", ts, regionNode.getRegionInfo().getEncodedName()); + return false; + } + return true; + } + + @Override + protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode) + throws IOException { + if (!assign(env.getMasterServices(), regionNode)) { return false; } // If the region is SPLIT, we can't assign it. But state might be CLOSED, rather than http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index e80cdf3..9f1dd38 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -241,8 +241,7 @@ public abstract class RegionTransitionProcedure public synchronized void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - LOG.warn("Remote call failed {}; {}; {}; exception={}", serverName, - this, regionNode.toShortString(), exception.getClass().getSimpleName(), exception); + LOG.warn("Remote call failed {}; {}", regionNode.toShortString(), this, exception); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index b001885..8a391da 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -306,7 +306,7 @@ public class UnassignProcedure extends RegionTransitionProcedure { exception.getClass().getSimpleName()); if (!env.getMasterServices().getServerManager().expireServer(serverName)) { // Failed to queue an expire. Lots of possible reasons including it may be already expired. - // In ServerCrashProcedure and RecoverMetaProcedure, there is a handleRIT stage where we + // In ServerCrashProcedure, there is a handleRIT stage where we // will iterator over all the RIT procedures for the related regions of a crashed RS and // fail them with ServerCrashException. You can see the isSafeToProceed method above for // more details. http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java index 12b4267..02d8b10 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java @@ -686,7 +686,6 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { boolean hasLock = true; final LockAndQueue[] regionLocks = new LockAndQueue[regionInfo.length]; for (int i = 0; i < regionInfo.length; ++i) { - LOG.info("{} checking lock on {}", procedure, regionInfo[i].getEncodedName()); assert regionInfo[i] != null; assert regionInfo[i].getTable() != null; assert regionInfo[i].getTable().equals(table): regionInfo[i] + " " + procedure; @@ -694,12 +693,16 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { regionLocks[i] = locking.getRegionLock(regionInfo[i].getEncodedName()); if (!regionLocks[i].tryExclusiveLock(procedure)) { + LOG.info("Waiting xlock for {} held by pid={}", procedure, + regionLocks[i].getExclusiveLockProcIdOwner()); waitProcedure(regionLocks[i], procedure); hasLock = false; while (i-- > 0) { regionLocks[i].releaseExclusiveLock(procedure); } break; + } else { + LOG.info("xlock for {}", procedure); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 775c8c2..6ec0079 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterWalManager; +import org.apache.hadoop.hbase.master.assignment.AssignProcedure; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; @@ -176,6 +177,13 @@ public class ServerCrashProcedure // it does the check by calling am#isLogSplittingDone. List toAssign = handleRIT(env, regionsOnCrashedServer); AssignmentManager am = env.getAssignmentManager(); + // Do not create assigns for Regions on disabling or disabled Tables. + // We do this inside in the AssignProcedure. + int size = toAssign.size(); + if (toAssign.removeIf(r -> !AssignProcedure.assign(env.getMasterServices(), r))) { + LOG.debug("Dropped {} assigns because against disabling/disabled tables", + size - toAssign.size()); + } // CreateAssignProcedure will try to use the old location for the region deploy. addChildProcedure(am.createAssignProcedures(toAssign)); setNextState(ServerCrashState.SERVER_CRASH_HANDLE_RIT2);
RegionStateRIT time (ms) Retries
StateRetries
<% rs.getRegion().getEncodedName() %> <% RegionInfoDisplay.getDescriptiveNameFromRegionStateForDisplay(rs, assignmentManager.getConfiguration()) %><% (currentTime - rs.getStamp()) %> <% retryStatus %>