Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 7B18C200C86 for ; Wed, 17 May 2017 03:56:52 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 79829160BC9; Wed, 17 May 2017 01:56:52 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 991D6160BC1 for ; Wed, 17 May 2017 03:56:51 +0200 (CEST) Received: (qmail 69006 invoked by uid 500); 17 May 2017 01:56:50 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 68992 invoked by uid 99); 17 May 2017 01:56:50 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 17 May 2017 01:56:50 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 953E0DFDD5; Wed, 17 May 2017 01:56:50 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: stack@apache.org To: commits@hbase.apache.org Message-Id: <8190fb1ed5c34d37b7c04c05e1a2417f@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-18018 Changes to support abort for all procedures by default Date: Wed, 17 May 2017 01:56:50 +0000 (UTC) archived-at: Wed, 17 May 2017 01:56:52 -0000 Repository: hbase Updated Branches: refs/heads/master a8775b11d -> 5eb1b7b96 HBASE-18018 Changes to support abort for all procedures by default The default behavior for abort() method of StateMachineProcedure is changed to support aborting all procedures irrespective of if rollback is supported or not. Currently its observed that sometimes procedures may fail on a step which will be considered as retryable error as abort is not supported. As a result procedure may stuck in a endless loop repeating same step again.User should have an option to abort any stuck procedure and do clean up manually. Please refer to HBASE-18016 and discussion there. Signed-off-by: Michael Stack Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/5eb1b7b9 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/5eb1b7b9 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/5eb1b7b9 Branch: refs/heads/master Commit: 5eb1b7b96c6b870959669636057aa16e93646fa6 Parents: a8775b1 Author: Umesh Agashe Authored: Wed May 10 11:41:59 2017 -0700 Committer: Michael Stack Committed: Tue May 16 18:56:32 2017 -0700 ---------------------------------------------------------------------- .../hbase/procedure2/StateMachineProcedure.java | 37 +++++++++++++------- .../procedure2/TestStateMachineProcedure.java | 20 ++++++++++- .../master/procedure/DeleteTableProcedure.java | 9 +++++ 3 files changed, 52 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/5eb1b7b9/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java index ea2a41f..0590a93 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java @@ -105,14 +105,8 @@ public abstract class StateMachineProcedure * @param state the state enum object */ protected void setNextState(final TState state) { - if (aborted.get() && isRollbackSupported(getCurrentState())) { - setAbortFailure(getClass().getSimpleName(), "abort requested"); - } else { - if (aborted.get()) { - LOG.warn("ignoring abort request " + state); - } - setNextState(getStateId(state)); - } + setNextState(getStateId(state)); + failIfAborted(); } /** @@ -147,6 +141,8 @@ public abstract class StateMachineProcedure throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { updateTimestamp(); try { + failIfAborted(); + if (!hasMoreState() || isFailed()) return null; TState state = getCurrentState(); @@ -190,10 +186,11 @@ public abstract class StateMachineProcedure protected boolean abort(final TEnvironment env) { final boolean isDebugEnabled = LOG.isDebugEnabled(); final TState state = getCurrentState(); - if (isRollbackSupported(state)) { - if (isDebugEnabled) { - LOG.debug("abort requested for " + this + " state=" + state); - } + if (isDebugEnabled) { + LOG.debug("abort requested for " + this + " state=" + state); + } + + if (hasMoreState()) { aborted.set(true); return true; } else if (isDebugEnabled) { @@ -203,6 +200,20 @@ public abstract class StateMachineProcedure } /** + * If procedure has more states then abort it otherwise procedure is finished and abort can be + * ignored. + */ + protected final void failIfAborted() { + if (aborted.get()) { + if (hasMoreState()) { + setAbortFailure(getClass().getSimpleName(), "abort requested"); + } else { + LOG.warn("Ignoring abort request on state='" + getCurrentState() + "' for " + this); + } + } + } + + /** * Used by the default implementation of abort() to know if the current state can be aborted * and rollback can be triggered. */ @@ -219,7 +230,7 @@ public abstract class StateMachineProcedure return stateFlow != Flow.NO_MORE_STATE; } - private TState getCurrentState() { + protected TState getCurrentState() { return stateCount > 0 ? getState(states[stateCount-1]) : getInitialState(); } http://git-wip-us.apache.org/repos/asf/hbase/blob/5eb1b7b9/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestStateMachineProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestStateMachineProcedure.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestStateMachineProcedure.java index 82b767e..8347dbf 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestStateMachineProcedure.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestStateMachineProcedure.java @@ -93,6 +93,21 @@ public class TestStateMachineProcedure { } @Test + public void testAbortStuckProcedure() throws InterruptedException { + try { + procExecutor.getEnvironment().loop = true; + TestSMProcedure proc = new TestSMProcedure(); + long procId = procExecutor.submitProcedure(proc); + Thread.sleep(1000 + (int) (Math.random() * 4001)); + proc.abort(procExecutor.getEnvironment()); + ProcedureTestingUtility.waitProcedure(procExecutor, procId); + assertEquals(true, proc.isFailed()); + } finally { + procExecutor.getEnvironment().loop = false; + } + } + + @Test public void testChildOnLastStep() { long procId = procExecutor.submitProcedure(new TestSMProcedure()); ProcedureTestingUtility.waitProcedure(procExecutor, procId); @@ -142,7 +157,9 @@ public class TestStateMachineProcedure { env.execCount.incrementAndGet(); switch (state) { case STEP_1: - setNextState(TestSMProcedureState.STEP_2); + if (!env.loop) { + setNextState(TestSMProcedureState.STEP_2); + } break; case STEP_2: addChildProcedure(new SimpleChildProcedure()); @@ -190,5 +207,6 @@ public class TestStateMachineProcedure { AtomicInteger execCount = new AtomicInteger(0); AtomicInteger rollbackCount = new AtomicInteger(0); boolean triggerChildRollback = false; + boolean loop = false; } } http://git-wip-us.apache.org/repos/asf/hbase/blob/5eb1b7b9/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java index 9d0a283..bda68eb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java @@ -146,6 +146,15 @@ public class DeleteTableProcedure } @Override + protected boolean abort(MasterProcedureEnv env) { + // TODO: Current behavior is: with no rollback and no abort support, procedure may stuck + // looping in retrying failing step forever. Default behavior of abort is changed to support + // aborting all procedures. Override the default wisely. Following code retains the current + // behavior. Revisit it later. + return isRollbackSupported(getCurrentState()) ? super.abort(env) : false; + } + + @Override protected void rollbackState(final MasterProcedureEnv env, final DeleteTableState state) { if (state == DeleteTableState.DELETE_TABLE_PRE_OPERATION) { // nothing to rollback, pre-delete is just table-state checks.