hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bhuvnesh2...@apache.org
Subject incubator-hawq git commit: HAWQ-617. Add ignore-bad-hosts option.
Date Wed, 06 Apr 2016 21:34:34 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 8bd10631e -> 19604066b


HAWQ-617. Add ignore-bad-hosts option.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/19604066
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/19604066
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/19604066

Branch: refs/heads/master
Commit: 19604066b134bac8b7226a3abdb55411c181a4c4
Parents: 8bd1063
Author: Bhuvnesh Chaudhary <bchaudhary@pivotal.io>
Authored: Wed Apr 6 14:35:30 2016 -0700
Committer: Bhuvnesh Chaudhary <bchaudhary@pivotal.io>
Committed: Wed Apr 6 14:35:30 2016 -0700

----------------------------------------------------------------------
 tools/bin/gppylib/util/ssh_utils.py | 23 +++++++++++++++++++
 tools/bin/gpscp                     | 36 ++++++++++++++++++-----------
 tools/bin/hawq_ctl                  | 36 +++++++++++++++++++++--------
 tools/bin/hawqconfig                | 12 ++++++----
 tools/bin/hawqpylib/HAWQ_HELP.py    |  1 +
 tools/bin/hawqpylib/hawqlib.py      | 39 ++++++++++++++++++++++++++++++++
 tools/doc/gpscp_help                |  7 ++++++
 7 files changed, 126 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/gppylib/util/ssh_utils.py
----------------------------------------------------------------------
diff --git a/tools/bin/gppylib/util/ssh_utils.py b/tools/bin/gppylib/util/ssh_utils.py
index 3194e11..853c0f5 100644
--- a/tools/bin/gppylib/util/ssh_utils.py
+++ b/tools/bin/gppylib/util/ssh_utils.py
@@ -160,6 +160,29 @@ class HostList():
         
         return self.list
 
+    def removeBadHosts(self):
+        ''' Update list of host to include only the host on which SSH was successful'''
+
+        pool = WorkerPool()
+
+        for h in self.list:
+            cmd = Echo('ssh test', '', ctxt=REMOTE, remoteHost=h)
+            pool.addCommand(cmd)
+
+        pool.join()
+        pool.haltWork()
+
+        bad_hosts = []
+        working_hosts = []
+        for cmd in pool.getCompletedItems():
+            if not cmd.get_results().wasSuccessful():
+                bad_hosts.append(cmd.remoteHost)
+            else:
+                working_hosts.append(cmd.remoteHost)
+
+        self.list = working_hosts[:]
+        return bad_hosts
+
 # Session is a command session, derived from a base class cmd.Cmd
 class Session(cmd.Cmd):
     '''Implements a list of open ssh sessions ready to execute commands'''

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/gpscp
----------------------------------------------------------------------
diff --git a/tools/bin/gpscp b/tools/bin/gpscp
index d00f15d..c02d677 100755
--- a/tools/bin/gpscp
+++ b/tools/bin/gpscp
@@ -64,6 +64,7 @@ class Global:
     opt['-f'] = None
     opt['-J'] = '=:'
     opt['-r'] = False
+    opt['--ignore-bad-hosts'] = False
     filePath = []
 
 GV = Global()
@@ -86,18 +87,19 @@ def print_version():
 #############
 def parseCommandLine():
     try:
-        (options, args) = getopt.getopt(sys.argv[1:], '?vrJ:p:u:h:f:', ['version'])
+        (options, args) = getopt.getopt(sys.argv[1:], '?vrJ:p:u:h:f:', ['version', 'ignore-bad-hosts'])
     except Exception, e:
         usage('[ERROR] ' + str(e))
 
     for (switch, val) in options:
-	if (switch == '-?'): 	      usage(0)
-	elif (switch == '-v'):        GV.opt[switch] = True
-	elif (switch == '-f'):        GV.opt[switch] = val
-	elif (switch == '-h'):        GV.opt[switch].append(val)
-        elif (switch == '-J'):        GV.opt[switch] = val + ':'
-        elif (switch == '-r'):        GV.opt[switch] = True
-        elif (switch == '--version'): print_version()
+        if (switch == '-?'): 	                              usage(0)
+        elif (switch == '-v'):                              GV.opt[switch] = True
+        elif (switch == '-f'):                              GV.opt[switch] = val
+        elif (switch == '-h'):                              GV.opt[switch].append(val)
+        elif (switch == '-J'):                              GV.opt[switch] = val + ':'
+        elif (switch == '-r'):                              GV.opt[switch] = True
+        elif (switch == '--version'):                       print_version()
+        elif (switch == '--ignore-bad-hosts'):          GV.opt[switch] = True
 
     hf = (len(GV.opt['-h']) and 1 or 0) + (GV.opt['-f'] and 1 or 0)
     if hf != 1:
@@ -131,15 +133,23 @@ try:
     if GV.opt['-f']:
         hostlist.parseFile(GV.opt['-f'])
 
-    try:
-        hostlist.checkSSH()
-    except ssh_utils.SSHError, e:
-        sys.exit('[ERROR] ' + str(e))
+    if GV.opt['--ignore-bad-hosts']:
+        original_hostlist = hostlist.list
+        bad_hosts = hostlist.removeBadHosts()
+        if len(bad_hosts) == len(original_hostlist):
+            sys.exit('[ERROR]: Unable to SSH to any of the hosts {0}'.format(original_hostlist))
+        if len(bad_hosts) > 0:
+            print "[WARN]: Skipping syncing configuration file on hosts {0}, as ssh test failed".format(bad_hosts)
+    else:
+        try:
+            hostlist.checkSSH()
+        except ssh_utils.SSHError, e:
+            sys.exit('[ERROR] ' + str(e))
 
     GV.opt['-h'] = hostlist.filterMultiHomedHosts()
+
     if len(GV.opt['-h']) == 0:
         usage('Error: missing hosts in -h and/or -f arguments')
-
     scp = 'scp -o "BatchMode yes" -o "StrictHostKeyChecking no"'
     if GV.opt['-r']:  scp += ' -r'
     

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawq_ctl
----------------------------------------------------------------------
diff --git a/tools/bin/hawq_ctl b/tools/bin/hawq_ctl
index 7937ac6..a4e0c3c 100755
--- a/tools/bin/hawq_ctl
+++ b/tools/bin/hawq_ctl
@@ -493,6 +493,7 @@ class HawqStart:
         self.masteronly = opts.masteronly 
         self.special_mode = opts.special_mode
         self.restrict =  opts.restrict
+        self.ignore_bad_hosts = opts.ignore_bad_hosts
 
         self._get_config()
 
@@ -682,13 +683,24 @@ class HawqStart:
 
     def _start_all_segments(self):
         logger.info("Start all the segments in hawq cluster")
-        segment_cmd_str = self._start_segment_cmd()
         logger.info("Start segments in list: %s" % self.host_list)
-        work_list = []
+        bad_hosts = []
+        working_hosts = self.host_list
+        if self.ignore_bad_hosts:
+            working_hosts, bad_hosts = exclude_bad_hosts(self.host_list)
+            if len(bad_hosts) == len(self.host_list):
+                logger.error("Unable to SSH on any of the hosts, skipping segment start operation")
+                return
+            if len(bad_hosts) > 0:
+                logger.warning("Skipping starting segments in the list {0}, SSH test failed".format(bad_hosts))
+                self.hosts_count_number -= len(bad_hosts)
+
+        segment_cmd_str = self._start_segment_cmd()
         q = Queue.Queue()
-        for host in self.host_list:
+        work_list = []
+        for host in working_hosts:
             work_list.append({"func":remote_ssh,"args":(segment_cmd_str, host, self.user, q)})
-        work_list.append({"func":check_progress,"args":(q, self.hosts_count_number, 'start', 0, self.quiet)})
+        work_list.append({"func":check_progress,"args":(q, self.hosts_count_number, 'start', len(bad_hosts), self.quiet)})
         node_init = HawqCommands(name = 'HAWQ', action_name = 'start', logger = logger)
         node_init.get_function_list(work_list)
         node_init.start()
@@ -699,7 +711,6 @@ class HawqStart:
             logger.info("Segments started successfully")
         return node_init.return_flag
 
-
     def run(self):
         if self.node_type == "master":
             check_return_code(self.start_master(), logger, \
@@ -1145,7 +1156,7 @@ def hawq_activate_standby(opts, hawq_dict):
             logger.error("Stop master failed, try again with immediate mode")
             cmd = "%s; hawq stop master -a -M immediate -q;" % source_hawq_env
             return_result = remote_ssh(cmd, old_master_host_name, '')
-            if return_resutl != 0:
+            if return_result != 0:
                 logger.error("Stop master failed, abort")
                 logger.error("Please manually bring hawq cluster down, then do activate standby again")
                 sys.exit(1)
@@ -1175,13 +1186,13 @@ def hawq_activate_standby(opts, hawq_dict):
     
     # Set current standby host name as the new master host name in configuration.
     logger.info("Update master host name in hawq-site.xml")
-    cmd = "%s; hawq config -c hawq_master_address_host -v %s --skipvalidation -q" % \
-           (source_hawq_env, hawq_dict['hawq_standby_address_host'])
+    ignore_bad_hosts = '--ignore-bad-hosts' if opts.ignore_bad_hosts else ''
+    cmd = "%s; hawq config -c hawq_master_address_host -v %s --skipvalidation -q %s" % (source_hawq_env, hawq_dict['hawq_standby_address_host'], ignore_bad_hosts)
     check_return_code(remote_ssh(cmd, old_standby_host_name, ''), logger, "Set hawq_master_address_host failed")
 
     # Remove the old standby host configuration from hawq-site.xml.
     logger.info("Remove current standby from hawq-site.xml")
-    cmd = "%s; hawq config -r hawq_standby_address_host --skipvalidation -q" % source_hawq_env
+    cmd = "%s; hawq config -r hawq_standby_address_host --skipvalidation -q %s" % (source_hawq_env, ignore_bad_hosts)
     check_return_code(remote_ssh(cmd, old_standby_host_name, ''), logger, "Remove hawq_standby_address_host from configuration failed")
 
     cmd = '''echo "gp_persistent_repair_global_sequence = true" >> %s/%s''' % (hawq_dict['hawq_master_directory'], 'postgresql.conf')
@@ -1205,7 +1216,7 @@ def hawq_activate_standby(opts, hawq_dict):
     logger.info("Start hawq cluster")
     cmd = "%s; hawq start master" % source_hawq_env
     check_return_code(remote_ssh(cmd, new_master_host_name, ''), logger, "Start master failed")
-    cmd = "%s; hawq start allsegments" % source_hawq_env
+    cmd = "%s; hawq start allsegments %s" % (source_hawq_env, ignore_bad_hosts)
     check_return_code(remote_ssh(cmd, new_master_host_name, ''), logger, "Start all the segments failed")
     cmd = '''sed -i "/gp_persistent_repair_global_sequence/d" %s/%s''' % (hawq_dict['hawq_master_directory'], 'postgresql.conf')
     check_return_code(remote_ssh(cmd, new_master_host_name, ''))
@@ -1279,6 +1290,10 @@ def create_parser():
     parser.add_option('-n', '--no-update', action='store_true',
                       dest='no_update', default=False,
                       help='Do not update system catalog tables.')
+    parser.add_option('-i', '--ignore-bad-hosts',
+                      dest='ignore_bad_hosts', action='store_true',
+                      default=False,
+                      help='Skips syncing configuration files on hosts on which SSH fails')
     parser.add_option("--bucket_number",
                       type="int",
                       dest="default_hash_table_bucket_number",
@@ -1319,6 +1334,7 @@ def create_parser():
                       dest="shared_buffers",
                       default="128000kB",
                       help="Sets the shared_buffers for formatting hawq database")
+
     (options, args) = parser.parse_args()
     if len(args) == 0:
         parser.print_help()

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqconfig
----------------------------------------------------------------------
diff --git a/tools/bin/hawqconfig b/tools/bin/hawqconfig
index 6618bc3..4a1e3c7 100755
--- a/tools/bin/hawqconfig
+++ b/tools/bin/hawqconfig
@@ -39,6 +39,8 @@ def parseargs():
     parser.add_option('-l', '--list', action='store_true',
                       help="List all HAWQ Properties.")
     parser.add_option('--skipvalidation', action='store_true', default=False)
+    parser.add_option('--ignore-bad-hosts', action='store_true', default=False,
+                      help='Skips copying configuration files on host on which SSH fails')
     parser.add_option('-q', '--quiet', action='store_true', default=False)
     parser.add_option("-v", "--value",
                       dest="property_value",
@@ -174,12 +176,11 @@ def update_hawq_site(org_config_file, hawq_site, property_name, property_value):
         append_xml_property(org_config_file, property_name, property_value)
 
 
-def sync_hawq_site(config_dir, host_list):
+def sync_hawq_site(config_dir, host_list, ignore_bad_hosts):
     sync_host_str = ""
     for node in host_list:
         sync_host_str += " -h %s" % node
-
-    result = local_ssh("hawq scp %s %s/etc/hawq-site.xml =:%s/etc/" % (sync_host_str, config_dir, config_dir))
+    result = local_ssh("hawq scp %s %s %s/etc/hawq-site.xml =:%s/etc/" % (sync_host_str, ignore_bad_hosts, config_dir, config_dir))
     if result != 0:
         sys.exit("sync hawq-site.xml failed.")
 
@@ -196,6 +197,7 @@ if __name__ == '__main__':
     segment_list = parse_hosts_file(GPHOME)
     master_host = hawq_site.hawq_dict['hawq_master_address_host']
     host_list = segment_list + [master_host]
+    ignore_bad_hosts = '--ignore-bad-hosts' if options.ignore_bad_hosts else ''
     if 'hawq_standby_address_host' in hawq_site.hawq_dict:
         standby_host = hawq_site.hawq_dict['hawq_standby_address_host']
         if standby_host not in ('None', 'none', ''):
@@ -212,7 +214,7 @@ if __name__ == '__main__':
             check_property_valid(hawq_site, options.change)
 
         update_hawq_site(org_config_file, hawq_site, options.change, options.property_value)
-        sync_hawq_site(GPHOME, host_list)
+        sync_hawq_site(GPHOME, host_list, ignore_bad_hosts)
 
         if not options.quiet:
             latest_hawq_site = HawqXMLParser(GPHOME)
@@ -232,7 +234,7 @@ if __name__ == '__main__':
             print "Remove %s is not allowed" % options.remove
             sys.exit(1)
         remove_property_xml(options.remove, org_config_file, options.quiet)
-        sync_hawq_site(GPHOME, host_list)
+        sync_hawq_site(GPHOME, host_list, ignore_bad_hosts)
     else:
         print "Please input correct options"
         sys.exit(1)

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqpylib/HAWQ_HELP.py
----------------------------------------------------------------------
diff --git a/tools/bin/hawqpylib/HAWQ_HELP.py b/tools/bin/hawqpylib/HAWQ_HELP.py
index 2c184ac..e0d901f 100755
--- a/tools/bin/hawqpylib/HAWQ_HELP.py
+++ b/tools/bin/hawqpylib/HAWQ_HELP.py
@@ -164,6 +164,7 @@ The "options" are:
    -v --verbose        Displays detailed status.
    -r --remove         HAWQ GUC name to be removed.
    --skipvalidation    Skip the system validation checks.
+   --ignore-bad-hosts  Skips copying configuration files on host on which SSH fails
 
 See 'hawq --help' for more information on other commands.
 """

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/bin/hawqpylib/hawqlib.py
----------------------------------------------------------------------
diff --git a/tools/bin/hawqpylib/hawqlib.py b/tools/bin/hawqpylib/hawqlib.py
index 85354b4..79bcdae 100755
--- a/tools/bin/hawqpylib/hawqlib.py
+++ b/tools/bin/hawqpylib/hawqlib.py
@@ -24,6 +24,8 @@ from xml.dom import minidom
 from xml.etree.ElementTree import ElementTree
 import shutil
 from gppylib.db import dbconn
+from gppylib.commands.base import WorkerPool, REMOTE
+from gppylib.commands.unix import Echo
 import re
 
 
@@ -484,3 +486,40 @@ def get_hawq_hostname_all(master_port):
 
     hawq_host_array = {'master': {master_host: master_status}, 'standby': {standby_host: standby_status}, 'segment': seg_host_list} 
     return hawq_host_array
+
+def get_host_status(hostlist):
+    """
+    Test if SSH command works on a host and return a dictionary
+    Return Ex: {host1: True, host2: False}
+    where True represents SSH command success and False represents failure
+    """
+    if not isinstance(hostlist, list):
+        raise Exception("Input parameter should be of type list")
+
+    pool = WorkerPool()
+
+    for host in hostlist:
+        cmd = Echo('ssh test', '', ctxt=REMOTE, remoteHost=host)
+        pool.addCommand(cmd)
+
+    pool.join()
+    pool.haltWork()
+
+    host_status_dict = {}
+    for cmd in pool.getCompletedItems():
+        if not cmd.get_results().wasSuccessful():
+            host_status_dict[cmd.remoteHost] = False
+        else:
+            host_status_dict[cmd.remoteHost] = True
+
+    return host_status_dict
+
+
+def exclude_bad_hosts(host_list):
+    """
+    Split Hosts on which SSH works vs node on which it fails
+    """
+    host_status_dict = get_host_status(host_list)
+    working_hosts = [host for host in host_status_dict.keys() if host_status_dict[host]]
+    bad_hosts = list(set(host_list) - set(working_hosts))
+    return working_hosts, bad_hosts

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/19604066/tools/doc/gpscp_help
----------------------------------------------------------------------
diff --git a/tools/doc/gpscp_help b/tools/doc/gpscp_help
index 4212288..aa5bd25 100755
--- a/tools/doc/gpscp_help
+++ b/tools/doc/gpscp_help
@@ -84,6 +84,13 @@ character is an equal sign (=).
 Optional. Reports additional messages in addition to the 
 SCP command output.
 
+--ignore-bad-hosts
+Skip copying files to the hosts on which the test SSH attempt failed
+and continue with the remaining hosts. A failed test SSH indicates
+that either the host is not working or there are issues while attempting
+to SSH to these hosts. Once the skipped hosts are brought back, ensure
+that the required files are synced to them.
+
 <file_to_copy>
 
 Required. The file name (or absolute path) of a file that 


Mime
View raw message