aurora-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject git commit: Adding executor health check disabler
Date Fri, 17 Oct 2014 19:40:52 GMT
Repository: incubator-aurora
Updated Branches:
  refs/heads/master 6a9e8b00e -> 9ef14e78d


Adding executor health check disabler

The health check disabler allows health checks for a job
to be snoozed temporarily by touching a snooze file in
the job's sandbox.

Bugs closed: AURORA-795

Reviewed at https://reviews.apache.org/r/26383/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/9ef14e78
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/9ef14e78
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/9ef14e78

Branch: refs/heads/master
Commit: 9ef14e78d54542d1ba8a068c8e481bfade71be37
Parents: 6a9e8b0
Author: David Pan <david.pan2@gmail.com>
Authored: Fri Oct 17 12:32:51 2014 -0700
Committer: Maxim Khutornenko <maxim@apache.org>
Committed: Fri Oct 17 12:32:51 2014 -0700

----------------------------------------------------------------------
 docs/user-guide.md                              | 12 ++++++
 .../aurora/executor/common/health_checker.py    | 26 +++++++++++--
 .../python/apache/aurora/executor/common/BUILD  |  1 +
 .../executor/common/test_health_checker.py      | 40 ++++++++++++++++++++
 4 files changed, 76 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/9ef14e78/docs/user-guide.md
----------------------------------------------------------------------
diff --git a/docs/user-guide.md b/docs/user-guide.md
index e12ee89..c956e60 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -215,6 +215,18 @@ Please see the
 [configuration reference](configuration-reference.md#user-content-healthcheckconfig-objects) for
 configuration options for this feature.
 
+#### Snoozing Health Checks
+
+If you need to pause your health check, you can do so by touching a file inside of your sandbox,
+named `.healthchecksnooze`
+
+As long as that file is present, health checks will be disabled, enabling users to gather core dumps
+or other performance measurements without worrying about Aurora's health check killing their
+process.
+
+WARNING: Remember to remove this when you are done, otherwise your instance will have permanently
+disabled health checks.
+
 #### Tearing a task down
 
 The Executor follows an escalation sequence when killing a running task:

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/9ef14e78/src/main/python/apache/aurora/executor/common/health_checker.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py b/src/main/python/apache/aurora/executor/common/health_checker.py
index 4980411..60676ba 100644
--- a/src/main/python/apache/aurora/executor/common/health_checker.py
+++ b/src/main/python/apache/aurora/executor/common/health_checker.py
@@ -12,6 +12,7 @@
 # limitations under the License.
 #
 
+import os.path
 import threading
 import time
 
@@ -35,6 +36,7 @@ class ThreadedHealthChecker(ExceptionalThread):
 
   def __init__(self,
       health_checker,
+      sandbox,
       interval_secs,
       initial_interval_secs,
       max_consecutive_failures,
@@ -42,6 +44,8 @@ class ThreadedHealthChecker(ExceptionalThread):
     """
     :param health_checker: health checker to confirm service health
     :type health_checker: function that returns (boolean, <string>)
+    :param sandbox: Sandbox of the task corresponding to this health check.
+    :type sandbox: DirectorySandbox
     :param interval_secs: delay between checks
     :type interval_secs: int
     :param initial_interval_secs: seconds to wait before starting checks
@@ -52,11 +56,16 @@ class ThreadedHealthChecker(ExceptionalThread):
     :type clock: time module
     """
     self.checker = health_checker
+    self.sandbox = sandbox
     self.clock = clock
     self.current_consecutive_failures = 0
     self.dead = threading.Event()
     self.interval = interval_secs
     self.max_consecutive_failures = max_consecutive_failures
+    self.snooze_file = None
+
+    if self.sandbox and self.sandbox.exists():
+      self.snooze_file = os.path.join(self.sandbox.root, '.healthchecksnooze')
 
     if initial_interval_secs is not None:
       self.initial_interval = initial_interval_secs
@@ -66,10 +75,18 @@ class ThreadedHealthChecker(ExceptionalThread):
     if self.initial_interval > 0:
       self.healthy, self.reason = True, None
     else:
-      self.healthy, self.reason = self.checker()
+      self.healthy, self.reason = self._perform_check_if_not_disabled()
     super(ThreadedHealthChecker, self).__init__()
     self.daemon = True
 
+  def _perform_check_if_not_disabled(self):
+    if self.snooze_file and os.path.isfile(self.snooze_file):
+      log.info("Health check snooze file found at %s. Health checks disabled.", self.snooze_file)
+      return True, None
+
+    log.debug("Health checks enabled. Performing health check.")
+    return self.checker()
+
   def _maybe_update_failure_count(self, is_healthy, reason):
     if not is_healthy:
       log.warning('Health check failure: %s' % reason)
@@ -88,7 +105,7 @@ class ThreadedHealthChecker(ExceptionalThread):
     self.clock.sleep(self.initial_interval)
     log.debug('Initial interval expired.')
     while not self.dead.is_set():
-      is_healthy, reason = self.checker()
+      is_healthy, reason = self._perform_check_if_not_disabled()
       self._maybe_update_failure_count(is_healthy, reason)
       self.clock.sleep(self.interval)
 
@@ -110,12 +127,14 @@ class HealthChecker(StatusChecker):
 
   def __init__(self,
                health_checker,
+               sandbox=None,
                interval_secs=10,
                initial_interval_secs=None,
                max_consecutive_failures=0,
                clock=time):
     self.threaded_health_checker = ThreadedHealthChecker(
         health_checker,
+        sandbox,
         interval_secs,
         initial_interval_secs,
         max_consecutive_failures,
@@ -136,7 +155,7 @@ class HealthChecker(StatusChecker):
 
 
 class HealthCheckerProvider(StatusCheckerProvider):
-  def from_assigned_task(self, assigned_task, _):
+  def from_assigned_task(self, assigned_task, sandbox):
     mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
     portmap = resolve_ports(mesos_task, assigned_task.assignedPorts)
 
@@ -149,6 +168,7 @@ class HealthCheckerProvider(StatusCheckerProvider):
         timeout_secs=health_check_config.get('timeout_secs'))
     health_checker = HealthChecker(
         http_signaler.health,
+        sandbox,
         interval_secs=health_check_config.get('interval_secs'),
         initial_interval_secs=health_check_config.get('initial_interval_secs'),
         max_consecutive_failures=health_check_config.get('max_consecutive_failures'))

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/9ef14e78/src/test/python/apache/aurora/executor/common/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/executor/common/BUILD b/src/test/python/apache/aurora/executor/common/BUILD
index c7f7a00..318e66d 100644
--- a/src/test/python/apache/aurora/executor/common/BUILD
+++ b/src/test/python/apache/aurora/executor/common/BUILD
@@ -64,6 +64,7 @@ python_tests(
     '3rdparty/python:twitter.common.testing',
     'src/main/python/apache/aurora/common:http_signaler',
     'src/main/python/apache/aurora/executor/common:health_checker',
+    'src/main/python/apache/aurora/executor/common:sandbox',
     'src/main/thrift/org/apache/aurora/gen:py-thrift',
   ]
 )

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/9ef14e78/src/test/python/apache/aurora/executor/common/test_health_checker.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/executor/common/test_health_checker.py b/src/test/python/apache/aurora/executor/common/test_health_checker.py
index aa36415..2be5b6c 100644
--- a/src/test/python/apache/aurora/executor/common/test_health_checker.py
+++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py
@@ -12,6 +12,7 @@
 # limitations under the License.
 #
 
+import os.path
 import threading
 import time
 import unittest
@@ -29,6 +30,7 @@ from apache.aurora.executor.common.health_checker import (
     HealthCheckerProvider,
     ThreadedHealthChecker
 )
+from apache.aurora.executor.common.sandbox import SandboxInterface
 
 from gen.apache.aurora.api.ttypes import AssignedTask, ExecutorConfig, TaskConfig
 
@@ -146,17 +148,55 @@ class TestThreadedHealthChecker(unittest.TestCase):
   def setUp(self):
     self.signaler = mock.Mock(spec=HttpSignaler)
     self.signaler.health.return_value = (True, 'Fake')
+
+    self.sandbox = mock.Mock(spec_set=SandboxInterface)
+    self.sandbox.exists.return_value = True
+    self.sandbox.root = '/root'
+
     self.initial_interval_secs = 1
     self.interval_secs = 5
     self.max_consecutive_failures = 2
     self.clock = mock.Mock(spec=time)
     self.threaded_health_checker = ThreadedHealthChecker(
         self.signaler.health,
+        None,
+        self.interval_secs,
+        self.initial_interval_secs,
+        self.max_consecutive_failures,
+        self.clock)
+
+    self.threaded_health_checker_sandbox_exists = ThreadedHealthChecker(
+        self.signaler.health,
+        self.sandbox,
         self.interval_secs,
         self.initial_interval_secs,
         self.max_consecutive_failures,
         self.clock)
 
+  def test_perform_check_if_not_disabled_snooze_file_is_none(self):
+    self.threaded_health_checker.snooze_file = None
+
+    assert self.signaler.health.call_count == 0
+    self.threaded_health_checker._perform_check_if_not_disabled()
+    assert self.signaler.health.call_count == 1
+
+  @mock.patch('os.path', spec_set=os.path)
+  def test_perform_check_if_not_disabled_no_snooze_file(self, mock_os_path):
+    mock_os_path.isfile.return_value = False
+
+    assert self.signaler.health.call_count == 0
+    self.threaded_health_checker_sandbox_exists._perform_check_if_not_disabled()
+    assert self.signaler.health.call_count == 1
+
+  @mock.patch('os.path', spec_set=os.path)
+  def test_perform_check_if_not_disabled_snooze_file_exists(self, mock_os_path):
+    mock_os_path.isfile.return_value = True
+
+    assert self.signaler.health.call_count == 0
+    result = self.threaded_health_checker_sandbox_exists._perform_check_if_not_disabled()
+    assert self.signaler.health.call_count == 0
+    assert result == (True, None)
+
   def test_maybe_update_failure_count(self):
     assert self.threaded_health_checker.current_consecutive_failures == 0
     assert self.threaded_health_checker.healthy is True


Mime
View raw message