aurora-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject git commit: Improve documentation and testing for host maintenance API
Date Wed, 11 Jun 2014 00:58:02 GMT
Repository: incubator-aurora
Updated Branches:
  refs/heads/master 768cecd08 -> 6feda1cc5


Improve documentation and testing for host maintenance API

Bugs closed: AURORA-318

Reviewed at https://reviews.apache.org/r/20285/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/6feda1cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/6feda1cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/6feda1cc

Branch: refs/heads/master
Commit: 6feda1cc5ebbc6f3fba8159b6cd7e595b113b294
Parents: 768cecd
Author: Joe Smith <yasumoto7@gmail.com>
Authored: Tue Jun 10 17:57:34 2014 -0700
Committer: Maxim Khutornenko <maxim@apache.org>
Committed: Tue Jun 10 17:57:34 2014 -0700

----------------------------------------------------------------------
 .../apache/aurora/admin/host_maintenance.py     |  98 +++++++++---
 src/main/python/apache/aurora/client/base.py    |  13 ++
 src/test/python/apache/aurora/admin/BUILD       |   3 +
 .../aurora/admin/test_host_maintenance.py       | 151 ++++++++++++++++++-
 .../aurora/client/commands/test_maintenance.py  |   5 +-
 5 files changed, 240 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/6feda1cc/src/main/python/apache/aurora/admin/host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/host_maintenance.py b/src/main/python/apache/aurora/admin/host_maintenance.py
index f223c5e..71f27bf 100644
--- a/src/main/python/apache/aurora/admin/host_maintenance.py
+++ b/src/main/python/apache/aurora/admin/host_maintenance.py
@@ -26,6 +26,14 @@ from gen.apache.aurora.api.ttypes import Hosts, MaintenanceMode
 class HostMaintenance(object):
   """Submit requests to the scheduler to put hosts into and out of maintenance
   mode so they can be operated upon without causing LOST tasks.
+
+  Aurora provides a two-tiered concept of Maintenance. The first step is to initiate maintenance,
+  which will ask the Aurora scheduler to de-prioritize scheduling on a large set of hosts (the ones
+  that will be operated upon during this maintenance window).  Once all hosts have been tagged in
+  this manner, the operator can begin draining individual machines, which will have all user-tasks
+  killed and rescheduled.  When the tasks get placed onto a new machine, the scheduler will first
+  look for hosts that do not have the maintenance tag, which will help decrease churn and prevent a
+  task from being constantly killed as its hosts go down from underneath it.
   """
 
   START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)
@@ -41,24 +49,37 @@ class HostMaintenance(object):
     self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
 
   def _drain_hosts(self, drainable_hosts, clock=time):
-    """This will actively turn down tasks running on hosts."""
+    """"Drains tasks from the specified hosts.
+
+    This will move active tasks on these hosts to the DRAINING state, causing them to be
+    rescheduled elsewhere.
+
+    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
+    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
+    :param clock: time module for testing
+    :type clock: time
+    """
     check_and_log_response(self._client.drain_hosts(drainable_hosts))
-    not_ready_hosts = [hostname for hostname in drainable_hosts.hostNames]
-    while not_ready_hosts:
+    not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
+    while not_ready_hostnames:
       log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
       clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
-      resp = self._client.maintenance_status(Hosts(not_ready_hosts))
+      resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
       if not resp.result.maintenanceStatusResult.statuses:
-        not_ready_hosts = None
+        not_ready_hostnames = None
       for host_status in resp.result.maintenanceStatusResult.statuses:
         if host_status.mode != MaintenanceMode.DRAINED:
           log.warning('%s is currently in status %s' %
               (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
         else:
-          not_ready_hosts.remove(host_status.host)
+          not_ready_hostnames.remove(host_status.host)
 
   def _complete_maintenance(self, drained_hosts):
-    """End the maintenance status for a give set of hosts."""
+    """End the maintenance status for a given set of hosts.
+
+    :param drained_hosts: Hosts that are drained and finished being operated upon
+    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
+    """
     check_and_log_response(self._client.end_maintenance(drained_hosts))
     resp = self._client.maintenance_status(drained_hosts)
     for host_status in resp.result.maintenanceStatusResult.statuses:
@@ -66,37 +87,68 @@ class HostMaintenance(object):
         log.warning('%s is DRAINING or in DRAINED' % host_status.host)
 
   def _operate_on_hosts(self, drained_hosts, callback):
-    """Perform a given operation on a list of hosts that are ready for maintenance."""
-    for host in drained_hosts.hostNames:
-      callback(host)
+    """Perform a given operation on a list of hosts that are ready for maintenance.
+
+    :param drained_hosts: Hosts that have been drained (via _drain_hosts)
+    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
+    :param callback: Function to call one hostname at a time
+    :type callback: function
+    """
+    for hostname in drained_hosts.hostNames:
+      callback(hostname)
 
-  def end_maintenance(self, hosts):
-    """Pull a list of hosts out of maintenance mode."""
-    self._complete_maintenance(Hosts(set(hosts)))
+  def end_maintenance(self, hostnames):
+    """Pull a list of hostnames out of maintenance mode.
 
-  def start_maintenance(self, hosts):
-    """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
-    check_and_log_response(self._client.start_maintenance(Hosts(set(hosts))))
+    :param hostnames: List of hosts to operate upon
+    :type hostnames: list of strings
+    """
+    self._complete_maintenance(Hosts(set(hostnames)))
+
+  def start_maintenance(self, hostnames):
+    """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.
+
+    This is part of two-phase draining- tasks will still be running on these hosts until
+    drain_hosts is called upon them.
+
+    :param hostnames: List of hosts to set for initial maintenance
+    :type hostnames: list of strings
+    """
+    check_and_log_response(self._client.start_maintenance(Hosts(set(hostnames))))
 
-  def perform_maintenance(self, hosts, grouping_function=DEFAULT_GROUPING,
+  def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
                           callback=None):
-    """The wrap a callback in between sending hosts into maintenance mode and back.
+    """Wrap a callback in between sending hosts into maintenance mode and back.
 
     Walk through the process of putting hosts into maintenance, draining them of tasks,
     performing an action on them once drained, then removing them from maintenance mode
     so tasks can schedule.
+
+    :param hostnames: A list of hosts to operate upon
+    :type hostnames: list of strings
+    :param groups_per_batch: Number of groups (by default, hosts) to operate on at once
+    :type groups_per_batch: int
+    :param grouping_function: How to split up the hostname into groups
+    :type grouping_function: function
+    :param callback: Function to call once hosts are drained
+    :type callback: function
     """
-    self._complete_maintenance(Hosts(set(hosts)))
-    self.start_maintenance(hosts)
+    self.start_maintenance(hostnames)
 
-    for hosts in self.iter_batches(hosts, grouping_function):
+    for hosts in self.iter_batches(hostnames, grouping_function):
       self._drain_hosts(hosts)
       if callback:
         self._operate_on_hosts(hosts, callback)
       self._complete_maintenance(hosts)
 
-  def check_status(self, hosts):
-    resp = self._client.maintenance_status(Hosts(set(hosts)))
+  def check_status(self, hostnames):
+    """Query the scheduler to determine the maintenance status for a list of hostnames
+
+    :param hostnames: Hosts to query for
+    :type hostnames: list of strings
+    :rtype: list of 2-tuples, hostname and MaintenanceMode
+    """
+    resp = self._client.maintenance_status(Hosts(set(hostnames)))
     check_and_log_response(resp)
     statuses = []
     for host_status in resp.result.maintenanceStatusResult.statuses:

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/6feda1cc/src/main/python/apache/aurora/client/base.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/base.py b/src/main/python/apache/aurora/client/base.py
index 6716ba6..3115540 100644
--- a/src/main/python/apache/aurora/client/base.py
+++ b/src/main/python/apache/aurora/client/base.py
@@ -142,6 +142,19 @@ def get_grouping_or_die(grouping_function):
 
 
 def group_hosts(hostnames, grouping_function=DEFAULT_GROUPING):
+  """Place a list of hosts into batches to be operated upon.
+
+  By default, the grouping function is 'by host' which means that maintenance will
+  operate on a single hostname at a time. By adding more grouping functions,
+  a site can setup a customized way of specifying groups, such as operating on a single
+  rack of hosts at a time.
+
+  :param hostnames: Hostnames to break into groups
+  :type hostnames: list of host names, must match the host names that slaves are registered with
+  :param grouping_function: Key within GROUPING_FUNCTIONS to partition hosts into desired batches
+  :type grouping_function: string
+  :rtype: dictionary of batches
+  """
   grouping_function = get_grouping_or_die(grouping_function)
   groups = defaultdict(set)
   for hostname in hostnames:

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/6feda1cc/src/test/python/apache/aurora/admin/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/BUILD b/src/test/python/apache/aurora/admin/BUILD
index 943995a..20f744b 100644
--- a/src/test/python/apache/aurora/admin/BUILD
+++ b/src/test/python/apache/aurora/admin/BUILD
@@ -22,6 +22,9 @@ python_tests(name = 'host_maintenance',
   sources = ['test_host_maintenance.py'],
   dependencies = [
     pants('3rdparty/python:mock'),
+    pants('3rdparty/python:twitter.common.log'),
+    pants('3rdparty/python:twitter.common.quantity'),
+    pants('src/main/python/apache/aurora/client:api'),
     pants('src/main/python/apache/aurora/common:cluster'),
     pants('src/main/python/apache/aurora/admin:host_maintenance'),
     pants('src/main/thrift/org/apache/aurora/gen:py-thrift'),

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/6feda1cc/src/test/python/apache/aurora/admin/test_host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/test_host_maintenance.py b/src/test/python/apache/aurora/admin/test_host_maintenance.py
index 176076f..0341d35 100644
--- a/src/test/python/apache/aurora/admin/test_host_maintenance.py
+++ b/src/test/python/apache/aurora/admin/test_host_maintenance.py
@@ -12,29 +12,172 @@
 # limitations under the License.
 #
 
+import copy
+import time
 import unittest
 
 import mock
+from twitter.common import log
+from twitter.common.quantity import Time
 
 from apache.aurora.admin.host_maintenance import HostMaintenance
+from apache.aurora.client.api import AuroraClientAPI
 from apache.aurora.client.base import add_grouping, remove_grouping
 from apache.aurora.common.cluster import Cluster
 
-from gen.apache.aurora.api.ttypes import Hosts, Response, ResponseCode
+from gen.apache.aurora.api.ttypes import (
+    Hosts,
+    HostStatus,
+    MaintenanceMode,
+    MaintenanceStatusResult,
+    Response,
+    ResponseCode,
+    Result
+)
 
 DEFAULT_CLUSTER = Cluster(
     name='us-west',
     scheduler_uri='us-west-234.example.com:8888',
 )
-MOCK_TEST_HOSTS = ['us-west-001.example.com']
+TEST_HOSTNAMES = [
+    'us-west-001.example.com',
+    'us-west-002.example.com',
+    'us-west-003.example.com']
 
 
 class TestHostMaintenance(unittest.TestCase):
-  @mock.patch("apache.aurora.client.api.AuroraClientAPI.start_maintenance")
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
+      spec=AuroraClientAPI.maintenance_status)
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.drain_hosts",
+      spec=AuroraClientAPI.drain_hosts)
+  def test_drain_hosts(self, mock_drain_hosts, mock_maintenance_status):
+    fake_maintenance_status_response = [
+        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
+            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
+            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
+        ])))),
+        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINING),
+            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINING)
+        ])))),
+        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+        ])))),
+        Response(result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINED)
+        ]))))
+    ]
+    fake_maintenance_status_call_args = []
+    def fake_maintenance_status_side_effect(hosts):
+      fake_maintenance_status_call_args.append(copy.deepcopy(hosts))
+      return fake_maintenance_status_response.pop(0)
+
+    clock = mock.Mock(time)
+    mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
+    mock_maintenance_status.side_effect = fake_maintenance_status_side_effect
+    test_hosts = Hosts(set(TEST_HOSTNAMES))
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance._drain_hosts(test_hosts, clock)
+    mock_drain_hosts.assert_called_once_with(test_hosts)
+    assert clock.sleep.call_count == 4
+    assert clock.sleep.call_args == mock.call(
+        HostMaintenance.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
+    assert mock_maintenance_status.call_count == 4
+    assert fake_maintenance_status_call_args == [
+        (Hosts(set(TEST_HOSTNAMES))),
+        (Hosts(set(TEST_HOSTNAMES))),
+        (Hosts(set(TEST_HOSTNAMES))),
+        (Hosts(set([TEST_HOSTNAMES[0]])))]
+
+  @mock.patch("twitter.common.log.warning", spec=log.warning)
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
+      spec=AuroraClientAPI.maintenance_status)
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.end_maintenance",
+      spec=AuroraClientAPI.end_maintenance)
+  def test_complete_maintenance(self, mock_end_maintenance, mock_maintenance_status, mock_warning):
+    mock_maintenance_status.return_value = Response(result=Result(
+        maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.NONE),
+            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.NONE),
+            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.DRAINED)
+        ]))
+    ))
+    mock_end_maintenance.return_value = Response(responseCode=ResponseCode.OK)
+    test_hosts = Hosts(set(TEST_HOSTNAMES))
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance._complete_maintenance(test_hosts)
+    mock_end_maintenance.assert_called_once_with(test_hosts)
+    mock_maintenance_status.assert_called_once_with(test_hosts)
+    mock_warning.assert_called_once_with('%s is DRAINING or in DRAINED' % TEST_HOSTNAMES[2])
+
+  def test_operate_on_hosts(self):
+    mock_callback = mock.Mock()
+    test_hosts = Hosts(TEST_HOSTNAMES)
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance._operate_on_hosts(test_hosts, mock_callback)
+    assert mock_callback.call_count == 3
+
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._complete_maintenance",
+    spec=HostMaintenance._complete_maintenance)
+  def test_end_maintenance(self, mock_complete_maintenance):
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance.end_maintenance(TEST_HOSTNAMES)
+    mock_complete_maintenance.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
+
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.start_maintenance",
+      spec=AuroraClientAPI.start_maintenance)
   def test_start_maintenance(self, mock_api):
     mock_api.return_value = Response(responseCode=ResponseCode.OK)
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
-    maintenance.start_maintenance(MOCK_TEST_HOSTS)
+    maintenance.start_maintenance(TEST_HOSTNAMES)
+    mock_api.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
+
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._complete_maintenance",
+    spec=HostMaintenance._complete_maintenance)
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._operate_on_hosts",
+    spec=HostMaintenance._operate_on_hosts)
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._drain_hosts",
+    spec=HostMaintenance._drain_hosts)
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance.start_maintenance",
+    spec=HostMaintenance.start_maintenance)
+  def test_perform_maintenance(self, mock_start_maintenance, mock_drain_hosts,
+      mock_operate_on_hosts, mock_complete_maintenance):
+    mock_callback = mock.Mock()
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    maintenance.perform_maintenance(TEST_HOSTNAMES, callback=mock_callback)
+    mock_start_maintenance.assert_called_once_with(TEST_HOSTNAMES)
+    assert mock_drain_hosts.call_count == 3
+    assert mock_drain_hosts.call_args_list == [
+        mock.call(Hosts(set([hostname]))) for hostname in TEST_HOSTNAMES]
+    assert mock_operate_on_hosts.call_count == 3
+    assert mock_operate_on_hosts.call_args_list == [
+        mock.call(Hosts(set([hostname])), mock_callback) for hostname in TEST_HOSTNAMES]
+    assert mock_complete_maintenance.call_count == 3
+    assert mock_complete_maintenance.call_args_list == [
+        mock.call(Hosts(set([hostname]))) for hostname in TEST_HOSTNAMES]
+
+  @mock.patch("apache.aurora.client.api.AuroraClientAPI.maintenance_status",
+      spec=AuroraClientAPI.maintenance_status)
+  def test_check_status(self, mock_maintenance_status):
+    mock_maintenance_status.return_value = Response(responseCode=ResponseCode.OK, result=Result(
+        maintenanceStatusResult=MaintenanceStatusResult(set([
+            HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.DRAINING),
+            HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.DRAINED),
+            HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.NONE)
+        ]))
+    ))
+    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
+    statuses = maintenance.check_status(TEST_HOSTNAMES)
+    mock_maintenance_status.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
+    assert statuses == [
+        (TEST_HOSTNAMES[0], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINING]),
+        (TEST_HOSTNAMES[1], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINED]),
+        (TEST_HOSTNAMES[2], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.NONE])
+    ]
 
 
 def test_default_grouping():

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/6feda1cc/src/test/python/apache/aurora/client/commands/test_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_maintenance.py b/src/test/python/apache/aurora/client/commands/test_maintenance.py
index dd56b8d..827bd7f 100644
--- a/src/test/python/apache/aurora/client/commands/test_maintenance.py
+++ b/src/test/python/apache/aurora/client/commands/test_maintenance.py
@@ -116,7 +116,6 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
   def test_perform_maintenance_hosts(self):
     mock_options = self.make_mock_options()
     mock_options.post_drain_script = None
-    mock_options.groups_per_batch = '1'
     mock_options.grouping = 'by_host'
 
     def host_status_results(hostnames):
@@ -144,9 +143,9 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
       #TODO(jsmith): Consider not mocking out sleep and instead refactoring
       assert mock_sleep.call_count == 3
-      assert mock_scheduler_proxy.maintenanceStatus.call_count == 7
+      assert mock_scheduler_proxy.maintenanceStatus.call_count == 6
       assert mock_scheduler_proxy.drainHosts.call_count == 3
-      assert mock_scheduler_proxy.endMaintenance.call_count == 4
+      assert mock_scheduler_proxy.endMaintenance.call_count == 3
 
   def test_host_maintenance_status(self):
     mock_options = self.make_mock_options()


Mime
View raw message