Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 405E7200CC1 for ; Mon, 10 Jul 2017 10:00:30 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 3EC9C16702C; Mon, 10 Jul 2017 08:00:30 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 0FAC616702A for ; Mon, 10 Jul 2017 10:00:28 +0200 (CEST) Received: (qmail 16757 invoked by uid 500); 10 Jul 2017 08:00:28 -0000 Mailing-List: contact commits-help@ariatosca.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ariatosca.incubator.apache.org Delivered-To: mailing list commits@ariatosca.incubator.apache.org Received: (qmail 16748 invoked by uid 99); 10 Jul 2017 08:00:28 -0000 Received: from pnap-us-west-generic-nat.apache.org (HELO spamd3-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 10 Jul 2017 08:00:28 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd3-us-west.apache.org (ASF Mail Server at spamd3-us-west.apache.org) with ESMTP id A0A33191F17 for ; Mon, 10 Jul 2017 08:00:27 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd3-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: -4.221 X-Spam-Level: X-Spam-Status: No, score=-4.221 tagged_above=-999 required=6.31 tests=[HK_RANDOM_FROM=0.001, KAM_ASCII_DIVIDERS=0.8, RCVD_IN_DNSWL_HI=-5, RCVD_IN_MSPIKE_H3=-0.01, RCVD_IN_MSPIKE_WL=-0.01, RP_MATCHES_RCVD=-0.001, SPF_PASS=-0.001] autolearn=disabled Received: from mx1-lw-eu.apache.org ([10.40.0.8]) by localhost (spamd3-us-west.apache.org [10.40.0.10]) (amavisd-new, port 10024) with ESMTP id jRlzd45F4RgY for ; Mon, 10 Jul 2017 08:00:15 +0000 (UTC) Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx1-lw-eu.apache.org (ASF Mail Server at mx1-lw-eu.apache.org) with SMTP id A17A9623DC for ; Mon, 10 Jul 2017 08:00:12 +0000 (UTC) Received: (qmail 16002 invoked by uid 99); 10 Jul 2017 08:00:11 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 10 Jul 2017 08:00:11 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 82182E0A98; Mon, 10 Jul 2017 08:00:11 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: mxmrlv@apache.org To: commits@ariatosca.incubator.apache.org Message-Id: <1fa268c61c874244ba046a76fe7aac6d@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: incubator-ariatosca git commit: wip [Forced Update!] Date: Mon, 10 Jul 2017 08:00:11 +0000 (UTC) archived-at: Mon, 10 Jul 2017 08:00:30 -0000 Repository: incubator-ariatosca Updated Branches: refs/heads/ARIA-237-Support-for-resuming-failed-workflow-executions c1daa4a96 -> 7f0b80176 (forced update) wip Project: http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/commit/7f0b8017 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/tree/7f0b8017 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/diff/7f0b8017 Branch: refs/heads/ARIA-237-Support-for-resuming-failed-workflow-executions Commit: 7f0b80176556aea653c651edf6908fe1aaf9a45b Parents: 62875b5 Author: max-orlov Authored: Sun Jul 2 21:43:43 2017 +0300 Committer: max-orlov Committed: Mon Jul 10 11:00:06 2017 +0300 ---------------------------------------------------------------------- aria/modeling/orchestration.py | 4 +- aria/orchestrator/workflow_runner.py | 7 +- aria/orchestrator/workflows/core/engine.py | 18 ++- .../workflows/core/events_handler.py | 9 +- tests/modeling/test_models.py | 3 +- tests/orchestrator/test_workflow_runner.py | 128 ++++++++++++++++--- 6 files changed, 144 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/aria/modeling/orchestration.py ---------------------------------------------------------------------- diff --git a/aria/modeling/orchestration.py b/aria/modeling/orchestration.py index 7068557..4d7a5b5 100644 --- a/aria/modeling/orchestration.py +++ b/aria/modeling/orchestration.py @@ -65,7 +65,9 @@ class ExecutionBase(mixins.ModelMixin): PENDING: (STARTED, CANCELLED), STARTED: END_STATES + (CANCELLING,), CANCELLING: END_STATES, - CANCELLED: PENDING + # Retrying + CANCELLED: PENDING, + FAILED: PENDING } # region one_to_many relationships http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/aria/orchestrator/workflow_runner.py ---------------------------------------------------------------------- diff --git a/aria/orchestrator/workflow_runner.py b/aria/orchestrator/workflow_runner.py index df1725f..5592326 100644 --- a/aria/orchestrator/workflow_runner.py +++ b/aria/orchestrator/workflow_runner.py @@ -38,7 +38,8 @@ DEFAULT_TASK_RETRY_INTERVAL = 30 class WorkflowRunner(object): def __init__(self, model_storage, resource_storage, plugin_manager, - execution_id=None, service_id=None, workflow_name=None, inputs=None, executor=None, + execution_id=None, retry_failed=False, + service_id=None, workflow_name=None, inputs=None, executor=None, task_max_attempts=DEFAULT_TASK_MAX_ATTEMPTS, task_retry_interval=DEFAULT_TASK_RETRY_INTERVAL): """ @@ -62,6 +63,7 @@ class WorkflowRunner(object): "and service id with inputs") self._is_resume = execution_id is not None + self._retry_failed = retry_failed self._model_storage = model_storage self._resource_storage = resource_storage @@ -116,7 +118,8 @@ class WorkflowRunner(object): return self._model_storage.service.get(self._service_id) def execute(self): - self._engine.execute(ctx=self._workflow_context, resuming=self._is_resume) + self._engine.execute( + ctx=self._workflow_context, resuming=self._is_resume, retry_failed=self._retry_failed) def cancel(self): self._engine.cancel_execution(ctx=self._workflow_context) http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/aria/orchestrator/workflows/core/engine.py ---------------------------------------------------------------------- diff --git a/aria/orchestrator/workflows/core/engine.py b/aria/orchestrator/workflows/core/engine.py index d9c77e9..69505fc 100644 --- a/aria/orchestrator/workflows/core/engine.py +++ b/aria/orchestrator/workflows/core/engine.py @@ -41,14 +41,15 @@ class Engine(logger.LoggerMixin): self._executors = executors.copy() self._executors.setdefault(StubTaskExecutor, StubTaskExecutor()) - def execute(self, ctx, resuming=False): + def execute(self, ctx, resuming=False, retry_failed=False): """ Executes the workflow. """ if resuming: - events.on_resume_workflow_signal.send(ctx) + events.on_resume_workflow_signal.send(ctx, retry_failed=retry_failed) tasks_tracker = _TasksTracker(ctx) + try: events.start_workflow_signal.send(ctx) while True: @@ -68,8 +69,15 @@ class Engine(logger.LoggerMixin): if cancel: self._terminate_tasks(tasks_tracker.executing_tasks) events.on_cancelled_workflow_signal.send(ctx) - else: + elif all(task.status == task.SUCCESS or task.ignore_failure + for task in ctx.execution.tasks): events.on_success_workflow_signal.send(ctx) + else: + exception = "Tasks {tasks} remain failed".format( + tasks= + [t for t in ctx.execution.tasks if t.status == t.SUCCESS or t.ignore_failure] + ) + events.on_failure_workflow_signal.send(ctx, exception=exception) except BaseException as e: # Cleanup any remaining tasks self._terminate_tasks(tasks_tracker.executing_tasks) @@ -124,8 +132,10 @@ class Engine(logger.LoggerMixin): class _TasksTracker(object): + def __init__(self, ctx): self._ctx = ctx + self._tasks = ctx.execution.tasks self._executed_tasks = [task for task in self._tasks if task.has_ended()] self._executable_tasks = list(set(self._tasks) - set(self._executed_tasks)) @@ -155,7 +165,7 @@ class _TasksTracker(object): def executable_tasks(self): now = datetime.utcnow() # we need both lists since retrying task are in the executing task list. - for task in self._update_tasks(self._executing_tasks + self._executable_tasks): + for task in self._update_tasks(set(self._executing_tasks + self._executable_tasks)): if all([task.is_waiting(), task.due_at <= now, all(dependency in self._executed_tasks for dependency in task.dependencies) http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/aria/orchestrator/workflows/core/events_handler.py ---------------------------------------------------------------------- diff --git a/aria/orchestrator/workflows/core/events_handler.py b/aria/orchestrator/workflows/core/events_handler.py index eb6f271..8a6c03d 100644 --- a/aria/orchestrator/workflows/core/events_handler.py +++ b/aria/orchestrator/workflows/core/events_handler.py @@ -123,11 +123,18 @@ def _workflow_cancelled(workflow_context, *args, **kwargs): @events.on_resume_workflow_signal.connect -def _workflow_resume(workflow_context, *args, **kwargs): +def _workflow_resume(workflow_context, retry_failed=False, *args, **kwargs): with workflow_context.persist_changes: execution = workflow_context.execution execution.status = execution.PENDING + if retry_failed: + for task in execution.tasks: + if task.status == task.FAILED and not task.ignore_failure: + task.attempts_count = 0 + task.status = task.PENDING + + @events.on_cancelling_workflow_signal.connect def _workflow_cancelling(workflow_context, *args, **kwargs): http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/tests/modeling/test_models.py ---------------------------------------------------------------------- diff --git a/tests/modeling/test_models.py b/tests/modeling/test_models.py index bbc7352..cc3bfa7 100644 --- a/tests/modeling/test_models.py +++ b/tests/modeling/test_models.py @@ -324,8 +324,7 @@ class TestExecution(object): Execution.STARTED: [Execution.PENDING], Execution.CANCELLING: [Execution.PENDING, Execution.STARTED], - Execution.FAILED: [Execution.PENDING, - Execution.STARTED, + Execution.FAILED: [Execution.STARTED, Execution.SUCCEEDED, Execution.CANCELLED, Execution.CANCELLING], http://git-wip-us.apache.org/repos/asf/incubator-ariatosca/blob/7f0b8017/tests/orchestrator/test_workflow_runner.py ---------------------------------------------------------------------- diff --git a/tests/orchestrator/test_workflow_runner.py b/tests/orchestrator/test_workflow_runner.py index 3527f34..f8aabd1 100644 --- a/tests/orchestrator/test_workflow_runner.py +++ b/tests/orchestrator/test_workflow_runner.py @@ -50,7 +50,8 @@ custom_events = { 'is_resumed': Event(), 'is_active': Event(), 'execution_cancelled': Event(), - 'execution_ended': Event() + 'execution_ended': Event(), + 'execution_failed': Event() } @@ -161,7 +162,8 @@ def test_execute(request, service): assert engine_kwargs['ctx'].execution.workflow_name == 'test_workflow' mock_engine_execute.assert_called_once_with(ctx=workflow_runner._workflow_context, - resuming=False) + resuming=False, + retry_failed=False) def test_cancel_execution(request): @@ -348,7 +350,7 @@ class TestResumableWorkflows(object): self._create_interface(workflow_context, node, mock_resuming_task) wf_runner = self._create_initial_workflow_runner( - workflow_context, mock_parallel_workflow, thread_executor) + workflow_context, mock_two_parallel_tasks_workflow, thread_executor) wf_thread = Thread(target=wf_runner.execute) wf_thread.daemon = True @@ -386,14 +388,14 @@ class TestResumableWorkflows(object): assert node.attributes['invocations'].value == 3 assert wf_runner.execution.status == wf_runner.execution.SUCCEEDED - def test_resume_failed_task(self, workflow_context, thread_executor): + def test_resume_single_failed_task(self, workflow_context, thread_executor): node = workflow_context.model.node.get_by_name(tests_mock.models.DEPENDENCY_NODE_NAME) node.attributes['invocations'] = models.Attribute.wrap('invocations', 0) - self._create_interface(workflow_context, node, mock_failed_first_task) + self._create_interface(workflow_context, node, mock_conditional_failing_task) wf_runner = self._create_initial_workflow_runner( - workflow_context, mock_sequential_workflow, thread_executor) + workflow_context, mock_single_task_workflow, thread_executor) wf_thread = Thread(target=wf_runner.execute) wf_thread.setDaemon(True) wf_thread.start() @@ -435,6 +437,76 @@ class TestResumableWorkflows(object): assert task.status == task.SUCCESS assert wf_runner.execution.status == wf_runner.execution.SUCCEEDED + def test_resume_failed_task_and_successful_task(self, workflow_context, thread_executor): + node = workflow_context.model.node.get_by_name(tests_mock.models.DEPENDENCY_NODE_NAME) + node.attributes['invocations'] = models.Attribute.wrap('invocations', 0) + self._create_interface(workflow_context, node, mock_three_different_tasks) + + wf_runner = self._create_initial_workflow_runner( + workflow_context, mock_three_parallel_tasks_workflow, thread_executor) + wf_thread = Thread(target=wf_runner.execute) + wf_thread.setDaemon(True) + wf_thread.start() + + if custom_events['execution_ended'].wait(60) is False: + raise TimeoutError("Execution did not end") + + tasks = workflow_context.model.task.list(filters={'_stub_type': None}) + assert node.attributes['invocations'].value == 3 + + # First task passes + assert any(task.status == task.FAILED for task in tasks) + # Second task fails + assert any(task.status == task.SUCCESS for task in tasks) + # third task remains stuck + + assert wf_runner.execution.status in wf_runner.execution.FAILED + + custom_events['is_resumed'].set() + # third task ends + assert node.attributes['invocations'].value == 3 + + # Create a new workflow runner, with an existing execution id. This would cause + # the old execution to restart. + new_thread_executor = thread.ThreadExecutor() + try: + new_wf_runner = WorkflowRunner( + service_id=wf_runner.service.id, + inputs={}, + model_storage=workflow_context.model, + resource_storage=workflow_context.resource, + plugin_manager=None, + execution_id=wf_runner.execution.id, + executor=new_thread_executor) + + new_wf_runner.execute() + finally: + new_thread_executor.close() + assert node.attributes['invocations'].value == 3 + + # Resuming the workflow with retry failed should retry only the second task ( + # as it was the only one failed) + new_thread_executor = thread.ThreadExecutor() + try: + new_wf_runner = WorkflowRunner( + service_id=wf_runner.service.id, + retry_failed=True, + inputs={}, + model_storage=workflow_context.model, + resource_storage=workflow_context.resource, + plugin_manager=None, + execution_id=wf_runner.execution.id, + executor=new_thread_executor) + + new_wf_runner.execute() + finally: + new_thread_executor.close() + + # Wait for it to finish and assert changes. + assert node.attributes['invocations'].value == 4 + assert all(task.status == task.SUCCESS for task in tasks) + assert wf_runner.execution.status == wf_runner.execution.SUCCEEDED + @staticmethod @pytest.fixture def thread_executor(): @@ -494,7 +566,7 @@ class TestResumableWorkflows(object): @workflow -def mock_parallel_workflow(ctx, graph): +def mock_two_parallel_tasks_workflow(ctx, graph): node = ctx.model.node.get_by_name(tests_mock.models.DEPENDENCY_NODE_NAME) graph.add_tasks( api.task.OperationTask( @@ -517,19 +589,16 @@ def mock_resuming_task(ctx): @workflow -def mock_sequential_workflow(ctx, graph): +def mock_single_task_workflow(ctx, graph): node = ctx.model.node.get_by_name(tests_mock.models.DEPENDENCY_NODE_NAME) - graph.sequence( - api.task.OperationTask(node, - interface_name='aria.interfaces.lifecycle', - operation_name='create', - retry_interval=1, - max_attempts=10), + graph.add_tasks( + api.task.OperationTask( + node, 'aria.interfaces.lifecycle', 'create', retry_interval=1, max_attempts=10) ) @operation -def mock_failed_first_task(ctx): +def mock_conditional_failing_task(ctx): """ The task should run atmost ctx.task.max_attempts - 1 times, and only then pass. overall, the number of invocations should be ctx.task.max_attempts - 1 @@ -548,3 +617,32 @@ def mock_failed_first_task(ctx): else: # fail o.w. raise BaseException("stop this task") + + +@workflow +def mock_three_parallel_tasks_workflow(ctx, graph): + node = ctx.model.node.get_by_name(tests_mock.models.DEPENDENCY_NODE_NAME) + graph.add_tasks( + api.task.OperationTask( + node, 'aria.interfaces.lifecycle', 'create', retry_interval=1, max_attempts=1), + api.task.OperationTask( + node, 'aria.interfaces.lifecycle', 'create', retry_interval=1, max_attempts=1), + api.task.OperationTask( + node, 'aria.interfaces.lifecycle', 'create', retry_interval=1, max_attempts=1), + ) + + +@operation +def mock_three_different_tasks(ctx): + ctx.node.attributes['invocations'] += 1 + + # The first task should end gracefully + + if ctx.node.attributes['invocations'] == 2: + # The second task should fail + raise BaseException("First execution should fail") + + elif ctx.node.attributes['invocations'] == 3: + # The third task should remain stuck until the execution is resumed + while not custom_events['is_resumed'].isSet(): + pass