aurora-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject git commit: Adding SLA check into perform_maintenance_hosts command.
Date Wed, 11 Jun 2014 19:51:04 GMT
Repository: incubator-aurora
Updated Branches:
  refs/heads/master 2ed7d8d87 -> ea68ade10


Adding SLA check into perform_maintenance_hosts command.

Bugs closed: AURORA-445

Reviewed at https://reviews.apache.org/r/22167/


Project: http://git-wip-us.apache.org/repos/asf/incubator-aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-aurora/commit/ea68ade1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-aurora/tree/ea68ade1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-aurora/diff/ea68ade1

Branch: refs/heads/master
Commit: ea68ade108be4ca0669add73d51832e8db94c387
Parents: 2ed7d8d
Author: Maxim Khutornenko <maxim@apache.org>
Authored: Wed Jun 11 12:50:27 2014 -0700
Committer: Maxim Khutornenko <maxim@apache.org>
Committed: Wed Jun 11 12:50:27 2014 -0700

----------------------------------------------------------------------
 src/main/python/apache/aurora/admin/BUILD       |  10 +-
 .../python/apache/aurora/admin/admin_util.py    | 183 +++++++++++++++++++
 .../apache/aurora/admin/host_maintenance.py     |  52 +++++-
 src/main/python/apache/aurora/client/base.py    |  52 ------
 .../python/apache/aurora/client/cli/__init__.py |   4 +-
 .../python/apache/aurora/client/commands/BUILD  |   1 +
 .../apache/aurora/client/commands/admin.py      |  45 ++---
 .../aurora/client/commands/maintenance.py       |  63 ++++---
 .../aurora/admin/test_host_maintenance.py       |  20 +-
 .../aurora/client/commands/test_admin_sla.py    |   4 +-
 .../aurora/client/commands/test_maintenance.py  |  98 ++++++++--
 .../apache/aurora/client/commands/util.py       |  18 ++
 12 files changed, 420 insertions(+), 130 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/admin/BUILD
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/BUILD b/src/main/python/apache/aurora/admin/BUILD
index 8d850bc..637f4e6 100644
--- a/src/main/python/apache/aurora/admin/BUILD
+++ b/src/main/python/apache/aurora/admin/BUILD
@@ -16,10 +16,18 @@ python_library(
   name = 'host_maintenance',
   sources = 'host_maintenance.py',
   dependencies = [
+    pants(':util'),
     pants('3rdparty/python:twitter.common.log'),
     pants('3rdparty/python:twitter.common.quantity'),
     pants('src/main/python/apache/aurora/client:api'),
-    pants('src/main/python/apache/aurora/client:base'),
     pants('src/main/thrift/org/apache/aurora/gen:py-thrift'),
   ]
 )
+
+python_library(
+  name = 'util',
+  sources = 'admin_util.py',
+  dependencies = [
+    pants('src/main/python/apache/aurora/client:base'),
+  ]
+)

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/admin/admin_util.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/admin_util.py b/src/main/python/apache/aurora/admin/admin_util.py
new file mode 100644
index 0000000..d8517e9
--- /dev/null
+++ b/src/main/python/apache/aurora/admin/admin_util.py
@@ -0,0 +1,183 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import getpass
+import logging
+import optparse
+import os
+import subprocess
+from uuid import uuid1
+
+from apache.aurora.client.base import die
+
+"""Admin client utility functions shared between admin and maintenance modules."""
+
+# TODO(maxim): Switch to CLI ConfigurationPlugin within AURORA-486.
+LOGGER_NAME = 'aurora_admin'
+logger = logging.getLogger(LOGGER_NAME)
+CLIENT_ID = uuid1()
+
+
+def log_admin_message(sev, msg, *args, **kwargs):
+  """Logs message using the module-defined logger.
+
+   :param sev: message severity
+   :type sev: The numeric level of the logging event (one of DEBUG, INFO etc.)
+   :param msg: message to log
+   :type msg: string
+  """
+  extra = kwargs.get('extra', {})
+  extra['clientid'] = CLIENT_ID
+  extra['user'] = getpass.getuser()
+  extra['logger_name'] = LOGGER_NAME
+  kwargs['extra'] = extra
+  logger.log(sev, msg, *args, **kwargs)
+
+
+FILENAME_OPTION = optparse.Option(
+  '--filename',
+  dest='filename',
+  default=None,
+  help='Name of the file with hostnames')
+
+
+HOSTS_OPTION = optparse.Option(
+  '--hosts',
+  dest='hosts',
+  default=None,
+  help='Comma separated list of hosts')
+
+
+def parse_sla_percentage(percentage):
+  """Parses percentage value for an SLA check.
+
+  :param percentage: string percentage to parse
+  :type percentage: string
+  :rtype: float
+  """
+  val = float(percentage)
+  if val <= 0 or val > 100:
+    die('Invalid percentage %s. Must be within (0, 100].' % percentage)
+  return val
+
+
+def _parse_hostname_list(hostname_list):
+  hostnames = [hostname.strip() for hostname in hostname_list.split(",")]
+  if not hostnames:
+    die('No valid hosts found.')
+  return hostnames
+
+
+def _parse_hostname_file(filename):
+  with open(filename, 'r') as hosts:
+    hostnames = [hostname.strip() for hostname in hosts]
+  if not hostnames:
+    die('No valid hosts found in %s.' % filename)
+  return hostnames
+
+
+def parse_hostnames_optional(list_option, file_option):
+  """Parses host names from a comma-separated list or a filename.
+
+  Does not require either of the arguments (returns None list if no option provided).
+
+  :param list_option: command option with comma-separated list of host names
+  :type list_option: app.option
+  :param file_option: command option with filename (one host per line)
+  :type file_option: app.option
+  :rtype: list of host names or None.
+  """
+  if bool(list_option) and bool(file_option):
+    die('Cannot specify both filename and list for the same option.')
+  hostnames = None
+  if file_option:
+    hostnames = _parse_hostname_file(file_option)
+  elif list_option:
+    hostnames = _parse_hostname_list(list_option)
+  return hostnames
+
+
+def parse_hostnames(filename, hostnames):
+  """Parses host names from a comma-separated list or a filename.
+
+  Fails if neither filename nor hostnames provided.
+
+  :param filename: filename with host names (one per line)
+  :type filename: string
+  :param hostnames: comma-separated list of host names
+  :type hostnames: string
+  :rtype: list of host names
+  """
+  if bool(filename) == bool(hostnames):
+    die('Please specify either --filename or --hosts')
+  if filename:
+    hostnames = _parse_hostname_file(filename)
+  elif hostnames:
+    hostnames = _parse_hostname_list(hostnames)
+  if not hostnames:
+    die('No valid hosts found.')
+  return hostnames
+
+
+def parse_script(filename):
+  """Parses shell script from the provided file and wraps it up into a subprocess callback.
+
+  :param filename: name of the script file
+  :type filename: string
+  :rtype: function
+  """
+  if filename:
+    if not os.path.exists(filename):
+      die("No such file: %s" % filename)
+    cmd = os.path.abspath(filename)
+    return lambda host: subprocess.Popen([cmd, host])
+  else:
+    return None
+
+
+def print_results(results):
+  """Prints formatted SLA results.
+
+  :param results: formatted SLA results
+  :type results: list of string
+  """
+  for line in results:
+    print(line)
+
+
+def format_sla_results(host_groups, unsafe_only=False):
+  """Formats SLA check result output.
+
+  :param host_groups: SLA check result groups (grouped by external grouping criteria, e.g. by_host)
+  :type host_groups: list of (defaultdict(list))
+  :param unsafe_only: If True, includes only SLA-"unsafe" hosts from the results
+  :type unsafe_only: bool
+  :rtype: list of string
+  """
+  results = []
+  include_unsafe_only = lambda d: not d.safe if unsafe_only else True
+
+  for group in host_groups:
+    for host, job_details in sorted(group.items()):
+      host_details = '\n'.join(
+          ['%s\t%s\t%.2f\t%s\t%s' %
+              (host,
+               d.job.to_path(),
+               d.predicted_percentage,
+               d.safe,
+               'n/a' if d.safe_in_secs is None else d.safe_in_secs)
+              for d in sorted(job_details) if include_unsafe_only(d)])
+      if host_details:
+        results.append(host_details)
+  return results

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/admin/host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/admin/host_maintenance.py b/src/main/python/apache/aurora/admin/host_maintenance.py
index 71f27bf..97d484f 100644
--- a/src/main/python/apache/aurora/admin/host_maintenance.py
+++ b/src/main/python/apache/aurora/admin/host_maintenance.py
@@ -17,6 +17,7 @@ import time
 from twitter.common import log
 from twitter.common.quantity import Amount, Time
 
+from apache.aurora.admin.admin_util import format_sla_results, print_results
 from apache.aurora.client.api import AuroraClientAPI
 from apache.aurora.client.base import check_and_log_response, DEFAULT_GROUPING, group_hosts
 
@@ -38,6 +39,11 @@ class HostMaintenance(object):
 
   START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)
 
+  SLA_MIN_JOB_INSTANCE_COUNT = 20
+  SLA_UPTIME_PERCENTAGE_LIMIT = 95
+  SLA_UPTIME_DURATION_LIMIT = Amount(30, Time.MINUTES)
+
+
   @classmethod
   def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
     groups = group_hosts(hostnames, grouping_function)
@@ -97,6 +103,46 @@ class HostMaintenance(object):
     for hostname in drained_hosts.hostNames:
       callback(hostname)
 
+  def _check_sla(self, hostnames, grouping_function, percentage=None, duration=None):
+    """Check if the provided list of hosts passes the job uptime SLA check.
+
+    This is an all-or-nothing check, meaning that all provided hosts must pass their job
+    SLA check for the maintenance to proceed.
+
+    :param hostnames: list of host names to check SLA for
+    :type hostnames: list of strings
+    :param grouping_function: grouping function to apply to the given hosts
+    :type grouping_function: function
+    :param percentage: SLA uptime percentage override
+    :type percentage: float
+    :param duration: SLA uptime duration override
+    :type duration: twitter.common.quantity.Amount
+    :rtype: True if all hosts pass SLA check, False otherwise.
+    """
+    sla_percentage = percentage or self.SLA_UPTIME_PERCENTAGE_LIMIT
+    sla_duration = duration or self.SLA_UPTIME_DURATION_LIMIT
+
+    vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
+    host_groups = vector.probe_hosts(
+      sla_percentage,
+      sla_duration.as_(Time.SECONDS),
+      grouping_function)
+
+    # Given that maintenance is performed 1 group at a time, any result longer than 1 group
+    # should be considered a batch failure.
+    if host_groups:
+      if len(host_groups) > 1:
+        log.error('Illegal multiple groups detected in SLA results. Skipping hosts:%s' % hostnames)
+        return False
+
+      results = format_sla_results(host_groups, unsafe_only=True)
+      if results:
+        print_results(results)
+        log.warning('Some hosts in a group did not pass SLA check. Skipping group:%s' % hostnames)
+        return False
+
+    return True
+
   def end_maintenance(self, hostnames):
     """Pull a list of hostnames out of maintenance mode.
 
@@ -117,7 +163,7 @@ class HostMaintenance(object):
     check_and_log_response(self._client.start_maintenance(Hosts(set(hostnames))))
 
   def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
-                          callback=None):
+                          callback=None, percentage=None, duration=None):
     """Wrap a callback in between sending hosts into maintenance mode and back.
 
     Walk through the process of putting hosts into maintenance, draining them of tasks,
@@ -136,6 +182,10 @@ class HostMaintenance(object):
     self.start_maintenance(hostnames)
 
     for hosts in self.iter_batches(hostnames, grouping_function):
+      if not self._check_sla(list(hosts.hostNames), grouping_function, percentage, duration):
+        self._complete_maintenance(hosts)
+        continue
+
       self._drain_hosts(hosts)
       if callback:
         self._operate_on_hosts(hosts, callback)

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/client/base.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/base.py b/src/main/python/apache/aurora/client/base.py
index 3115540..663a247 100644
--- a/src/main/python/apache/aurora/client/base.py
+++ b/src/main/python/apache/aurora/client/base.py
@@ -101,20 +101,6 @@ class requires(object):  # noqa
     return real_fn
 
 
-FILENAME_OPTION = optparse.Option(
-    '--filename',
-    dest='filename',
-    default=None,
-    help='Name of the file with hostnames')
-
-
-HOSTS_OPTION = optparse.Option(
-    '--hosts',
-    dest='hosts',
-    default=None,
-    help='Comma separated list of hosts')
-
-
 def group_by_host(hostname):
   return hostname
 
@@ -172,44 +158,6 @@ GROUPING_OPTION = optparse.Option(
         ', '.join(GROUPING_FUNCTIONS.keys())))
 
 
-def parse_host_list(host_list):
-  hosts = [hostname.strip() for hostname in host_list.split(",")]
-  if not hosts:
-    die('No valid hosts found.')
-  return hosts
-
-
-def parse_host_file(filename):
-  with open(filename, 'r') as hosts:
-    hosts = [hostname.strip() for hostname in hosts]
-  if not hosts:
-    die('No valid hosts found in %s.' % filename)
-  return hosts
-
-
-def parse_hosts_optional(list_option, file_option):
-  if bool(list_option) and bool(file_option):
-    die('Cannot specify both filename and list for the same option.')
-  hosts = None
-  if file_option:
-    hosts = parse_host_file(file_option)
-  elif list_option:
-    hosts = parse_host_list(list_option)
-  return hosts
-
-
-def parse_hosts(filename, hosts):
-  if bool(filename) == bool(hosts):
-    die('Please specify either --filename or --hosts')
-  if filename:
-    hosts = parse_host_file(filename)
-  elif hosts:
-    hosts = parse_host_list(hosts)
-  if not hosts:
-    die('No valid hosts found.')
-  return hosts
-
-
 def synthesize_url(scheduler_url, role=None, env=None, job=None):
   if not scheduler_url:
     log.warning("Unable to find scheduler web UI!")

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/client/cli/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/cli/__init__.py b/src/main/python/apache/aurora/client/cli/__init__.py
index 0596daa..827bbb8 100644
--- a/src/main/python/apache/aurora/client/cli/__init__.py
+++ b/src/main/python/apache/aurora/client/cli/__init__.py
@@ -63,7 +63,8 @@ EXIT_UNKNOWN_ERROR = 20
 # invocation, and "user", which contains the username of the user who invoked
 # the client.
 
-logger = logging.getLogger("aurora_client")
+LOGGER_NAME = "aurora_client"
+logger = logging.getLogger(LOGGER_NAME)
 CLIENT_ID = uuid1()
 
 
@@ -77,6 +78,7 @@ def print_aurora_log(sev, msg, *args, **kwargs):
   extra = kwargs.get("extra", {})
   extra["clientid"] = CLIENT_ID
   extra["user"] = getpass.getuser()
+  extra["logger_name"] = LOGGER_NAME
   kwargs["extra"] = extra
   logger.log(sev, msg, *args, **kwargs)
 

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/client/commands/BUILD
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/commands/BUILD b/src/main/python/apache/aurora/client/commands/BUILD
index 03cd485..cc16923 100644
--- a/src/main/python/apache/aurora/client/commands/BUILD
+++ b/src/main/python/apache/aurora/client/commands/BUILD
@@ -29,6 +29,7 @@ python_library(
     pants('3rdparty/python:twitter.common.app'),
     pants('3rdparty/python:twitter.common.log'),
     pants('3rdparty/python:twitter.common.quantity'),
+    pants('src/main/python/apache/aurora/admin:util'),
     pants('src/main/python/apache/aurora/client/api'),
     pants('src/main/python/apache/aurora/client:base'),
     pants('src/main/python/apache/aurora/common:clusters'),

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/client/commands/admin.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/commands/admin.py b/src/main/python/apache/aurora/client/commands/admin.py
index a9f81e8..02d055f 100644
--- a/src/main/python/apache/aurora/client/commands/admin.py
+++ b/src/main/python/apache/aurora/client/commands/admin.py
@@ -22,17 +22,22 @@ from twitter.common import app, log
 from twitter.common.quantity import Amount, Data, Time
 from twitter.common.quantity.parse_simple import parse_data, parse_time
 
+from apache.aurora.admin.admin_util import (
+    FILENAME_OPTION,
+    format_sla_results,
+    HOSTS_OPTION,
+    parse_hostnames,
+    parse_hostnames_optional,
+    parse_sla_percentage,
+    print_results
+)
 from apache.aurora.client.api import AuroraClientAPI
 from apache.aurora.client.api.sla import JobUpTimeLimit
 from apache.aurora.client.base import (
     check_and_log_response,
     die,
-    FILENAME_OPTION,
     get_grouping_or_die,
     GROUPING_OPTION,
-    HOSTS_OPTION,
-    parse_hosts,
-    parse_hosts_optional,
     requires
 )
 from apache.aurora.common.aurora_job_key import AuroraJobKey
@@ -54,18 +59,6 @@ MIN_SLA_INSTANCE_COUNT = optparse.Option(
 )
 
 
-def print_results(results):
-  for line in results:
-    print(line)
-
-
-def parse_sla_percentage(percentage):
-  val = float(percentage)
-  if val <= 0 or val > 100:
-    die('Invalid percentage %s. Must be within (0, 100].' % percentage)
-  return val
-
-
 @app.command
 @app.command_option('--force', dest='force', default=False, action='store_true',
     help='Force expensive queries to run.')
@@ -407,8 +400,8 @@ def sla_list_safe_domain(cluster, percentage, duration):
   sla_percentage = parse_sla_percentage(percentage)
   sla_duration = parse_time(duration)
 
-  exclude_hosts = parse_hosts_optional(options.exclude_hosts, options.exclude_filename)
-  include_hosts = parse_hosts_optional(options.include_hosts, options.include_filename)
+  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
+  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
   override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
   get_grouping_or_die(options.grouping)
 
@@ -466,7 +459,7 @@ def sla_probe_hosts(cluster, percentage, duration):
 
   sla_percentage = parse_sla_percentage(percentage)
   sla_duration = parse_time(duration)
-  hosts = parse_hosts(options.filename, options.hosts)
+  hosts = parse_hostnames(options.filename, options.hosts)
   get_grouping_or_die(options.grouping)
 
   vector = AuroraClientAPI(
@@ -474,19 +467,7 @@ def sla_probe_hosts(cluster, percentage, duration):
       options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, hosts)
   groups = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)
 
-  results = []
-  for group in groups:
-    for host, job_details in sorted(group.items()):
-      results.append('\n'.join(
-          ['%s\t%s\t%.2f\t%s\t%s' %
-              (host,
-               d.job.to_path(),
-               d.predicted_percentage,
-               d.safe,
-               'n/a' if d.safe_in_secs is None else d.safe_in_secs)
-              for d in sorted(job_details)]))
-
-  print_results(results)
+  print_results(format_sla_results(groups))
 
 
 @app.command

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/main/python/apache/aurora/client/commands/maintenance.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/commands/maintenance.py b/src/main/python/apache/aurora/client/commands/maintenance.py
index e4d60d4..e2ac59a 100644
--- a/src/main/python/apache/aurora/client/commands/maintenance.py
+++ b/src/main/python/apache/aurora/client/commands/maintenance.py
@@ -12,21 +12,21 @@
 # limitations under the License.
 #
 
-import os
-import subprocess
+import logging
 
 from twitter.common import app, log
+from twitter.common.quantity.parse_simple import parse_time
 
-from apache.aurora.admin.host_maintenance import HostMaintenance
-from apache.aurora.client.base import (
-    die,
+from apache.aurora.admin.admin_util import (
     FILENAME_OPTION,
-    get_grouping_or_die,
-    GROUPING_OPTION,
     HOSTS_OPTION,
-    parse_hosts,
-    requires
+    log_admin_message,
+    parse_hostnames,
+    parse_script,
+    parse_sla_percentage
 )
+from apache.aurora.admin.host_maintenance import HostMaintenance
+from apache.aurora.client.base import die, get_grouping_or_die, GROUPING_OPTION, requires
 from apache.aurora.common.clusters import CLUSTERS
 
 
@@ -40,7 +40,7 @@ def start_maintenance_hosts(cluster):
   """
   options = app.get_options()
   HostMaintenance(CLUSTERS[cluster], options.verbosity).start_maintenance(
-      parse_hosts(options.filename, options.hosts))
+      parse_hostnames(options.filename, options.hosts))
 
 
 @app.command
@@ -53,12 +53,24 @@ def end_maintenance_hosts(cluster):
   """
   options = app.get_options()
   HostMaintenance(CLUSTERS[cluster], options.verbosity).end_maintenance(
-      parse_hosts(options.filename, options.hosts))
+      parse_hostnames(options.filename, options.hosts))
 
 
 @app.command
 @app.command_option('--post_drain_script', dest='post_drain_script', default=None,
     help='Path to a script to run for each host.')
+@app.command_option('--override_percentage', dest='percentage', default=None,
+    help='Percentage of tasks required to be up all the time within the duration. '
+         'Default value:%s. DO NOT override default value unless absolutely necessary! '
+         'See sla_probe_hosts and sla_list_safe_domain commands '
+         'for more details on SLA.' % HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT)
+@app.command_option('--override_duration', dest='duration', default=None,
+    help='Time interval (now - value) for the percentage of up tasks. Format: XdYhZmWs. '
+         'Default value:%s. DO NOT override default value unless absolutely necessary! '
+         'See sla_probe_hosts and sla_list_safe_domain commands '
+         'for more details on SLA.' % HostMaintenance.SLA_UPTIME_DURATION_LIMIT)
+@app.command_option('--override_reason', dest='reason', default=None,
+    help='Reason for overriding default SLA values.')
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @app.command_option(GROUPING_OPTION)
@@ -67,6 +79,9 @@ def perform_maintenance_hosts(cluster):
   """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                       [--post_drain_script=path]
                                       [--grouping=function]
+                                      [--override_percentage=percentage]
+                                      [--override_duration=duration]
+                                      [--override_reason=reason]
                                       cluster
 
   Asks the scheduler to remove any running tasks from the machine and remove it
@@ -74,21 +89,27 @@ def perform_maintenance_hosts(cluster):
   to service.
   """
   options = app.get_options()
-  drainable_hosts = parse_hosts(options.filename, options.hosts)
+  drainable_hosts = parse_hostnames(options.filename, options.hosts)
   get_grouping_or_die(options.grouping)
 
-  if options.post_drain_script:
-    if not os.path.exists(options.post_drain_script):
-      die("No such file: %s" % options.post_drain_script)
-    cmd = os.path.abspath(options.post_drain_script)
-    drained_callback = lambda host: subprocess.Popen([cmd, host])
-  else:
-    drained_callback = None
+  has_override = bool(options.percentage) or bool(options.duration) or bool(options.reason)
+  all_overrides = bool(options.percentage) and bool(options.duration) and bool(options.reason)
+  if has_override != all_overrides:
+    die('All --override_* options are required when attempting to override default SLA values.')
+
+  percentage = parse_sla_percentage(options.percentage) if options.percentage else None
+  duration = parse_time(options.duration) if options.duration else None
+  if options.reason:
+    log_admin_message(logging.WARNING, options.reason)
+
+  drained_callback = parse_script(options.post_drain_script)
 
   HostMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
       drainable_hosts,
+      grouping_function=options.grouping,
       callback=drained_callback,
-      grouping_function=options.grouping)
+      percentage=percentage,
+      duration=duration)
 
 
 @app.command
@@ -102,7 +123,7 @@ def host_maintenance_status(cluster):
   Check on the schedulers maintenance status for a list of hosts in the cluster.
   """
   options = app.get_options()
-  checkable_hosts = parse_hosts(options.filename, options.hosts)
+  checkable_hosts = parse_hostnames(options.filename, options.hosts)
   statuses = HostMaintenance(CLUSTERS[cluster], options.verbosity).check_status(checkable_hosts)
   for pair in statuses:
     log.info("%s is in state: %s" % pair)

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/test/python/apache/aurora/admin/test_host_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/admin/test_host_maintenance.py b/src/test/python/apache/aurora/admin/test_host_maintenance.py
index 0341d35..8abce0e 100644
--- a/src/test/python/apache/aurora/admin/test_host_maintenance.py
+++ b/src/test/python/apache/aurora/admin/test_host_maintenance.py
@@ -144,12 +144,16 @@ class TestHostMaintenance(unittest.TestCase):
     spec=HostMaintenance._drain_hosts)
   @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance.start_maintenance",
     spec=HostMaintenance.start_maintenance)
-  def test_perform_maintenance(self, mock_start_maintenance, mock_drain_hosts,
-      mock_operate_on_hosts, mock_complete_maintenance):
+  @mock.patch("apache.aurora.admin.host_maintenance.HostMaintenance._check_sla",
+    spec=HostMaintenance._check_sla)
+  def test_perform_maintenance(self, mock_check_sla, mock_start_maintenance,
+      mock_drain_hosts, mock_operate_on_hosts, mock_complete_maintenance):
     mock_callback = mock.Mock()
+    mock_check_sla.return_value = True
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
     maintenance.perform_maintenance(TEST_HOSTNAMES, callback=mock_callback)
     mock_start_maintenance.assert_called_once_with(TEST_HOSTNAMES)
+    assert mock_check_sla.call_count == 3
     assert mock_drain_hosts.call_count == 3
     assert mock_drain_hosts.call_args_list == [
         mock.call(Hosts(set([hostname]))) for hostname in TEST_HOSTNAMES]
@@ -171,13 +175,13 @@ class TestHostMaintenance(unittest.TestCase):
         ]))
     ))
     maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
-    statuses = maintenance.check_status(TEST_HOSTNAMES)
+    result = maintenance.check_status(TEST_HOSTNAMES)
     mock_maintenance_status.assert_called_once_with(Hosts(set(TEST_HOSTNAMES)))
-    assert statuses == [
-        (TEST_HOSTNAMES[0], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINING]),
-        (TEST_HOSTNAMES[1], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINED]),
-        (TEST_HOSTNAMES[2], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.NONE])
-    ]
+
+    assert len(result) == 3
+    assert (TEST_HOSTNAMES[0], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINING]) in result
+    assert (TEST_HOSTNAMES[1], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.DRAINED]) in result
+    assert (TEST_HOSTNAMES[2], MaintenanceMode._VALUES_TO_NAMES[MaintenanceMode.NONE]) in result
 
 
 def test_default_grouping():

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/test/python/apache/aurora/client/commands/test_admin_sla.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_admin_sla.py b/src/test/python/apache/aurora/client/commands/test_admin_sla.py
index da8015f..be380df 100644
--- a/src/test/python/apache/aurora/client/commands/test_admin_sla.py
+++ b/src/test/python/apache/aurora/client/commands/test_admin_sla.py
@@ -324,7 +324,7 @@ class TestAdminSlaProbeHostsCommand(AuroraClientCommandTest):
     """Tests successful execution of the sla_probe_hosts command with host list."""
     hosts = ['h0', 'h1']
     mock_options = self.setup_mock_options(hosts=','.join(hosts))
-    mock_vector = self.create_mock_vector(self.create_probe_hosts(2, 80, True, 0))
+    mock_vector = self.create_mock_probe_hosts_vector(self.create_probe_hosts(2, 80, True, 0))
     with contextlib.nested(
         patch('apache.aurora.client.commands.admin.AuroraClientAPI',
             new=Mock(spec=AuroraClientAPI)),
@@ -350,7 +350,7 @@ class TestAdminSlaProbeHostsCommand(AuroraClientCommandTest):
 
   def test_probe_hosts_with_file(self):
     """Tests successful execution of the sla_probe_hosts command with host filename."""
-    mock_vector = self.create_mock_vector(self.create_probe_hosts(1, 80, False, None))
+    mock_vector = self.create_mock_probe_hosts_vector(self.create_probe_hosts(1, 80, False, None))
     with temporary_file() as fp:
       fp.write('h0')
       fp.flush()

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/test/python/apache/aurora/client/commands/test_maintenance.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/test_maintenance.py b/src/test/python/apache/aurora/client/commands/test_maintenance.py
index 827bd7f..642c235 100644
--- a/src/test/python/apache/aurora/client/commands/test_maintenance.py
+++ b/src/test/python/apache/aurora/client/commands/test_maintenance.py
@@ -45,6 +45,9 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     mock_options.cluster = self.TEST_CLUSTER
     mock_options.verbosity = False
     mock_options.disable_all_hooks = False
+    mock_options.percentage = None
+    mock_options.duration = None
+    mock_options.reason = None
     return mock_options
 
   def create_host_statuses(self, maintenance_mode):
@@ -88,10 +91,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     with contextlib.nested(
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
-        patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_scheduler_proxy_class,
-            mock_clusters_maintenancepatch,
-            options):
+        patch('twitter.common.app.get_options', return_value=mock_options)):
       start_maintenance_hosts([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
@@ -104,10 +104,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     with contextlib.nested(
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
-        patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_scheduler_proxy_class,
-            mock_clusters_maintenancepatch,
-            options):
+        patch('twitter.common.app.get_options', return_value=mock_options)):
       end_maintenance_hosts([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.endMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
@@ -128,14 +125,18 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
     mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
     mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
+    mock_vector = self.create_mock_probe_hosts_vector(self.create_probe_hosts(1, 95, True, None))
 
     with contextlib.nested(
         patch('time.sleep'),
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
+        patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
+              return_value=mock_vector),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
         patch('twitter.common.app.get_options', return_value=mock_options)) as (
             mock_sleep,
             mock_scheduler_proxy_class,
+            mock_vector_class,
             mock_clusters_maintenancepatch,
             options):
       perform_maintenance_hosts([self.TEST_CLUSTER])
@@ -147,6 +148,82 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
       assert mock_scheduler_proxy.drainHosts.call_count == 3
       assert mock_scheduler_proxy.endMaintenance.call_count == 3
 
+  def test_perform_maintenance_hosts_failed_default_sla(self):
+    mock_options = self.make_mock_options()
+    mock_options.post_drain_script = None
+    mock_options.grouping = 'by_host'
+
+    def host_status_results(hostnames):
+      if isinstance(hostnames, Hosts):
+        return self.create_drained_status_result(hostnames)
+      return self.create_maintenance_status_result()
+
+    mock_api, mock_scheduler_proxy = self.create_mock_api()
+    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
+    mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
+    mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
+    mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
+    mock_vector = self.create_mock_probe_hosts_vector(self.create_probe_hosts(1, 95, False, None))
+
+    with contextlib.nested(
+        patch('time.sleep'),
+        patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
+        patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
+              return_value=mock_vector),
+        patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+        patch('twitter.common.app.get_options', return_value=mock_options)):
+      perform_maintenance_hosts([self.TEST_CLUSTER])
+
+      mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
+      assert mock_scheduler_proxy.endMaintenance.call_count == len(self.HOSTNAMES)
+
+  def test_perform_maintenance_hosts_failed_custom_sla(self):
+    mock_options = self.make_mock_options()
+    mock_options.post_drain_script = None
+    mock_options.grouping = 'by_host'
+    mock_options.percentage = 50
+    mock_options.duration = '10m'
+    mock_options.reason = 'Test overrides'
+
+    def host_status_results(hostnames):
+      if isinstance(hostnames, Hosts):
+        return self.create_drained_status_result(hostnames)
+      return self.create_maintenance_status_result()
+
+    mock_api, mock_scheduler_proxy = self.create_mock_api()
+    mock_scheduler_proxy.endMaintenance.return_value = self.create_end_maintenance_result()
+    mock_scheduler_proxy.maintenanceStatus.side_effect = host_status_results
+    mock_scheduler_proxy.startMaintenance.return_value = self.create_start_maintenance_result()
+    mock_scheduler_proxy.drainHosts.return_value = self.create_start_maintenance_result()
+    mock_vector = self.create_mock_probe_hosts_vector(self.create_probe_hosts(1, 95, False, None))
+
+    with contextlib.nested(
+        patch('time.sleep'),
+        patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
+        patch('apache.aurora.client.api.sla.Sla.get_domain_uptime_vector',
+              return_value=mock_vector),
+        patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
+        patch('twitter.common.app.get_options', return_value=mock_options)):
+      perform_maintenance_hosts([self.TEST_CLUSTER])
+
+      mock_scheduler_proxy.startMaintenance.assert_called_with(Hosts(set(self.HOSTNAMES)))
+      assert mock_scheduler_proxy.endMaintenance.call_count == len(self.HOSTNAMES)
+
+  def test_perform_maintenance_hosts_reason_missing(self):
+    mock_options = self.make_mock_options()
+    mock_options.grouping = 'by_host'
+    mock_options.percentage = 50
+    mock_options.duration = '10m'
+
+    with contextlib.nested(
+        patch('twitter.common.app.get_options', return_value=mock_options)):
+      try:
+        perform_maintenance_hosts([self.TEST_CLUSTER])
+      except SystemExit:
+        pass
+      else:
+        assert 'Expected error is not raised.'
+
   def test_host_maintenance_status(self):
     mock_options = self.make_mock_options()
     mock_api, mock_scheduler_proxy = self.create_mock_api()
@@ -154,10 +231,7 @@ class TestMaintenanceCommands(AuroraClientCommandTest):
     with contextlib.nested(
         patch('apache.aurora.client.api.SchedulerProxy', return_value=mock_scheduler_proxy),
         patch('apache.aurora.client.commands.maintenance.CLUSTERS', new=self.TEST_CLUSTERS),
-        patch('twitter.common.app.get_options', return_value=mock_options)) as (
-            mock_scheduler_proxy_class,
-            mock_clusters_maintenancepatch,
-            options):
+        patch('twitter.common.app.get_options', return_value=mock_options)):
       host_maintenance_status([self.TEST_CLUSTER])
 
       mock_scheduler_proxy.maintenanceStatus.assert_called_with(Hosts(set(self.HOSTNAMES)))

http://git-wip-us.apache.org/repos/asf/incubator-aurora/blob/ea68ade1/src/test/python/apache/aurora/client/commands/util.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/commands/util.py b/src/test/python/apache/aurora/client/commands/util.py
index 8478417..b1822f2 100644
--- a/src/test/python/apache/aurora/client/commands/util.py
+++ b/src/test/python/apache/aurora/client/commands/util.py
@@ -13,10 +13,13 @@
 #
 
 import unittest
+from collections import defaultdict
 
 from mock import Mock
 
+from apache.aurora.client.api.sla import DomainUpTimeSlaVector, JobUpTimeDetails
 from apache.aurora.client.hooks.hooked_api import HookedAuroraClientAPI
+from apache.aurora.common.aurora_job_key import AuroraJobKey
 from apache.aurora.common.cluster import Cluster
 from apache.aurora.common.clusters import Clusters
 
@@ -126,3 +129,18 @@ jobs = [HELLO_WORLD]
   def get_invalid_config(cls, bad_clause):
     return cls.get_test_config(cls.TEST_CLUSTER, cls.TEST_ROLE, cls.TEST_ENV, cls.TEST_JOB,
         bad_clause)
+
+  @classmethod
+  def create_mock_probe_hosts_vector(cls, result):
+    mock_vector = Mock(spec=DomainUpTimeSlaVector)
+    mock_vector.probe_hosts.return_value = result
+    return mock_vector
+
+  @classmethod
+  def create_probe_hosts(cls, num_hosts, predicted, safe, safe_in):
+    hosts = defaultdict(list)
+    for i in range(num_hosts):
+      host_name = 'h%s' % i
+      job = AuroraJobKey.from_path('west/role/env/job%s' % i)
+      hosts[host_name].append(JobUpTimeDetails(job, predicted, safe, safe_in))
+    return [hosts]


Mime
View raw message