hawq-commits mailing list archives

From: r...@apache.org
Subject: incubator-hawq git commit: HAWQ-472. Add checks to HAWQ standby start up
Date: Thu, 03 Mar 2016 02:40:07 GMT
Repository: incubator-hawq
Updated Branches:
  refs/heads/master 74297b676 -> 419d89e7b


HAWQ-472. Add checks to HAWQ standby start up


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/419d89e7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/419d89e7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/419d89e7

Branch: refs/heads/master
Commit: 419d89e7b2bb2e61180391cc1fbca1f62d30c158
Parents: 74297b6
Author: rlei <rlei@pivotal.io>
Authored: Mon Feb 29 16:38:58 2016 +0800
Committer: rlei <rlei@pivotal.io>
Committed: Thu Mar 3 10:38:49 2016 +0800

----------------------------------------------------------------------
 tools/bin/hawq_ctl             |  10 +-
 tools/sbin/hawqstandbywatch.py | 229 ++++++++++++++++++++++++++++++++++++
 2 files changed, 235 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
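
In outline, the new tools/sbin/hawqstandbywatch.py tails the standby's pg_log
directory for a few seconds after the syncmaster is launched and decides from
what it sees whether startup actually succeeded. A simplified sketch of that
decision logic (illustrative only; the committed SyncmasterWatcher class in the
diff below is the authoritative version):

    # Simplified sketch of the log-based health check added by this commit.
    def looks_healthy(lines):
        """Return False on an obvious startup problem, True otherwise."""
        for line in lines:
            if line.startswith('Traceback'):              # traceback recorded
                return False
            if 'could not bind IPv4 socket' in line:      # wrong address family
                return False
            if 'QDSYNC: scan forward' in line:            # syncmaster appears to be working
                return True
        return True  # nothing suspicious seen in the short log window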


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/419d89e7/tools/bin/hawq_ctl
----------------------------------------------------------------------
diff --git a/tools/bin/hawq_ctl b/tools/bin/hawq_ctl
index 9a38bf3..a3b06b5 100755
--- a/tools/bin/hawq_ctl
+++ b/tools/bin/hawq_ctl
@@ -237,7 +237,7 @@ class HawqInit:
             logger.info("running standby host is %s" % running_standby_host)
             signal.signal(signal.SIGINT,signal.SIG_IGN)
             logger.info("Stop HAWQ cluster")
-            cmd = "%s; hawq stop master -a -q" % source_hawq_env
+            cmd = "%s; hawq stop master -a -M fast -q" % source_hawq_env
             check_return_code(local_ssh(cmd, logger), logger, "Stop HAWQ master failed, exit")
             cmd = "%s; hawq stop allsegments -a -q" % source_hawq_env
             check_return_code(local_ssh(cmd, logger), logger, "Stop HAWQ segments failed,
exit")
@@ -259,11 +259,11 @@ class HawqInit:
                 check_return_code(local_ssh(cmd, logger), logger, \
                                   "Update catalog failed, exit", "Catalog updated successfully.")
                 logger.info("Stop HAWQ master")
-                cmd = "%s; hawq stop master -a" % source_hawq_env
+                cmd = "%s; hawq stop master -a -M fast" % source_hawq_env
                 check_return_code(local_ssh(cmd, logger), logger, "Stop hawq master failed,
exit")
             except DatabaseError, ex:
                 logger.error("Failed to connect to database, this script can only be run
when the database is up")
-                cmd = "%s; hawq stop master -a" % source_hawq_env
+                cmd = "%s; hawq stop master -a -M fast" % source_hawq_env
                 check_return_code(local_ssh(cmd, logger), logger, "Stop hawq master failed,
exit")
             remove_property_xml("hawq_standby_address_host", "%s/etc/hawq-site.xml" % self.GPHOME)
             host_list = parse_hosts_file(self.GPHOME)
@@ -312,7 +312,7 @@ class HawqInit:
 
     def _resync_standby(self):
         logger.info("Re-sync standby")
-        cmd = "%s; hawq stop master -a" % source_hawq_env
+        cmd = "%s; hawq stop master -a;" % source_hawq_env
         check_return_code(local_ssh(cmd, logger), logger, "Stop hawq cluster failed, exit")
         cmd = "cd %s; %s; %s/bin/lib/pysync.py -x gpperfmon/data -x pg_log -x db_dumps %s
%s:%s" % \
                  (self.master_data_directory, source_hawq_env,  self.GPHOME, self.master_data_directory,
@@ -540,6 +540,8 @@ class HawqStart:
     def start_standby(self):
         cmd = self._start_standby_cmd()
         result = remote_ssh(cmd, self.standby_host_name, self.user)
+        cmd = "%s; %s/sbin/hawqstandbywatch.py %s debug" % (source_hawq_env, self.GPHOME,
self.master_data_directory)
+        result = remote_ssh(cmd, self.standby_host_name, self.user)
         return result
 
     def _start_segment_cmd(self):
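
The net effect of the two lines added above: right after the standby syncmaster
is asked to start, hawq_ctl runs the new watcher script on the standby host and
returns its exit status as the start result. A minimal sketch of that call, with
a hypothetical stand-in for remote_ssh() and example values for the paths and
host (none of these values come from the patch):

    # Sketch only: remote_ssh() is a stand-in for the hawq_ctl helper of the
    # same name; host, user and paths are example values.
    import subprocess

    def remote_ssh(cmd, host, user):
        """Run cmd on host over ssh and return the remote exit status."""
        return subprocess.call(["ssh", "%s@%s" % (user, host), cmd])

    source_hawq_env = "source /usr/local/hawq/greenplum_path.sh"  # assumed
    GPHOME = "/usr/local/hawq"                                    # assumed
    master_data_directory = "/data/hawq/master"                   # assumed

    cmd = "%s; %s/sbin/hawqstandbywatch.py %s debug" % (
        source_hawq_env, GPHOME, master_data_directory)
    result = remote_ssh(cmd, "standby.example.com", "gpadmin")
    # result == 0: the watcher saw a healthy syncmaster; non-zero: startup failed.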

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/419d89e7/tools/sbin/hawqstandbywatch.py
----------------------------------------------------------------------
diff --git a/tools/sbin/hawqstandbywatch.py b/tools/sbin/hawqstandbywatch.py
new file mode 100755
index 0000000..82cf699
--- /dev/null
+++ b/tools/sbin/hawqstandbywatch.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+# Line too long - pylint: disable=C0301
+# Invalid name  - pylint: disable=C0103
+
+"""
+  hawqstandbywatch.py
+  Copyright (c) EMC/Greenplum Inc 2011. All Rights Reserved.
+
+  Check actual contents and process state of syncmaster
+  in order to properly return accurate information back to
+  gpinitstandby via gpstart.
+"""
+
+import os
+import sys
+import glob
+import time
+
+from gppylib.gplog    import setup_tool_logging, get_default_logger
+from gppylib.commands import gp, unix
+
+
+def matching_files(pglogdir, ignore=None, setlimit=False):
+    """
+    Generate a series of file names corresponding to files
+    in 'pglogdir' which are not in the specified 'ignore' map.
+
+    Note that startup.log is always returned if present.
+
+    If 'setlimit' is not false, files whose modification time
+    exceeds the ctime of startup.log will also be ignored.
+    """
+    mlimit = None
+    pattern = os.path.join(pglogdir, 'startup.log')
+    for path in glob.glob(pattern):
+        if setlimit:
+            mlimit = os.stat(path).st_ctime
+        yield path
+
+    home_directory = os.path.expanduser("~")
+    pattern = os.path.join('%s/hawqAdminLogs/' % home_directory, 'startup.log')
+    for path in glob.glob(pattern):
+        if setlimit:
+            mlimit = os.stat(path).st_ctime
+        yield path
+
+    pattern = os.path.join(pglogdir, '*.csv')
+    for path in glob.glob(pattern):
+        if ignore is not None and path in ignore:
+            continue
+        if mlimit is not None and os.stat(path).st_mtime >= mlimit:
+            continue
+        yield path
+
+
+def updated_files(pglogdir, ignore, prev):
+    """
+    Generate a series of (time, path) tuples corresponding to files
+    in 'pglogdir' and not in 'ignore' which were also modified
+    after 'prev' (or all files if 'prev' is None).
+    """
+    for path in matching_files(pglogdir, ignore):
+        ts = os.stat(path).st_mtime
+        if prev is None or prev < ts:
+            yield (ts, path)
+
+
+def updated_handles(pglogdir, ignore, prev, handles):
+    """
+    Generate a series of (time, handle) tuples corresponding to files
+    in 'pglogdir' and not in 'ignore' modified after 'prev'
+    (or all files if 'prev' is None).
+    """
+    for ts, path in updated_files(pglogdir, ignore, prev):
+        h = handles.get(path, None)
+        if h is None:
+            h = open(path, 'r')
+            handles[path] = h
+        yield (ts, h)
+
+
+
+class SyncmasterWatcher:
+    """
+    Watch changes to files in the pg_log directory recorded by the gpsyncmaster.
+    """
+
+    def __init__(self, datadir):
+        """
+        Build a map containing the existing contents of the pg_log
+        directory so that we can avoid getting confused by them
+        after we start the syncmaster.
+        """
+        self.datadir         = datadir
+        self.pglogdir        = os.path.join(self.datadir, 'pg_log')
+
+        # note use of setlimit=True here to prevent any files created
+        # after startup.log from becoming ignored.
+        self.ignore          = {}
+        for path in matching_files( self.pglogdir, setlimit=True ):
+            self.ignore[path] = True
+
+        self.handles         = {}
+        self.maxlines        = 1000
+        self.timelimit       = 5
+        self.delay           = 0.1
+
+
+    def tail_briefly(self):
+        """
+        Generate lines recently added to log files in the pg_log directory
+        updated after our __init__ constructor was called.
+        """
+        start   = time.time()                       # starting time
+        elapsed = 0                                 # time elapsed so far
+        count   = 0                                 # number of lines we've seen
+        tp      = None
+
+        # until we're out of time or have returned enough lines
+        while elapsed < self.timelimit and count < self.maxlines:
+
+            # for each file modified since we last checked
+            tn = None
+            for ts, h in updated_handles(self.pglogdir, self.ignore, tp, self.handles):
+
+                # track the last file modification time
+                if tn is None or tn < ts:
+                    tn = ts
+
+                # yield the new lines to the caller
+                while count < self.maxlines:
+                    line = h.readline()
+                    if not line:
+                        break
+                    yield line
+                    count += 1
+
+            # update the elapsed time
+            elapsed = time.time() - start
+
+            # if any new lines, update prev and keep checking for more
+            if tn is not None:
+                tp = tn
+                continue
+
+            # if we get here it means none of the files were updated in
+            # our last iteration. sleep a moment before checking for
+            # more updates
+            time.sleep(self.delay)
+
+
+
+    def monitor_logs(self):
+        """
+        Read the syncmaster log files for a few seconds, looking for
+        potential problems.
+
+        Returns 0 if no problems were seen or 1 if the startup log
+        contained an error or if the gpsyncmaster process exited before
+        we were done watching.
+        """
+        logger.info("Monitoring logs")
+
+        # now scan some of the syncmaster output for a moment
+        for line in self.tail_briefly():
+
+            if line.startswith('Traceback'):        # gpsyncmaster traceback recorded
+                logger.warning(line)
+                return 1
+
+            # MPP-13212 - since the syncmaster reports rejected client connections
+            #   as 'FATAL' errors, the presence of a 'FATAL' error need not indicate
+            #   a problem in the syncmaster so we comment out the following logic:
+            #
+            # if line.find('FATAL') >= 0:             # fatal error recorded
+            #     logger.warning(line)
+            #     return 1
+            #
+            # This is especially important for health monitoring clients which may
+            # rely on the difference between a rejected connection and a TCP failure.
+
+            if line.find('could not bind IPv4 socket') >= 0: # syncmaster used IPv6 by mistake
+                logger.warning(line)
+                return 1
+
+            if line.find('QDSYNC: scan forward') >= 0: # syncmaster appears to be working
+                logger.info(line)
+                break
+
+        logger.info("checking if syncmaster is running")
+        pid = gp.getSyncmasterPID('localhost', self.datadir)
+        if not pid > 0:
+            logger.warning("syncmaster not running")
+            return 1
+
+        # syncmaster is running and there are no obvious errors in the log
+        logger.info("syncmaster appears ok, pid %s" % pid)
+        return 0
+
+
+    def close(self):
+        """
+        Closes all handles to the logs we're watching.
+        """
+        for h in self.handles.values():
+            h.close()
+        self.handles = {}
+
+
+
+if __name__ == '__main__':
+
+    # setup gpAdminLogs logging
+    execname = os.path.split(sys.argv[0])[-1]
+    hostname = unix.getLocalHostname()
+    username = unix.getUserName()
+    setup_tool_logging(execname, hostname, username)
+    logger = get_default_logger()
+
+    # watch syncmaster logs
+    if len(sys.argv) > 2 and sys.argv[2] == 'debug':
+        print "Checking standby master status"
+    watcher = SyncmasterWatcher( sys.argv[1] )
+    rc = watcher.monitor_logs()
+    watcher.close()
+
+    # report final status
+    # logger.info("exiting with %s" % rc)
+    sys.exit( rc )
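
For reference, the watcher can also be driven programmatically on the standby
host, mirroring the __main__ block above. A minimal sketch, assuming gppylib
and hawqstandbywatch are importable and that /data/hawq/master is the standby
data directory (both assumptions, not part of the patch); note that the
module-level `logger` has to be set before monitor_logs() is called:

    # Sketch: exercise SyncmasterWatcher from another tool on the standby host.
    import sys
    import hawqstandbywatch
    from gppylib.gplog import setup_tool_logging, get_default_logger
    from gppylib.commands import unix

    setup_tool_logging("standbycheck", unix.getLocalHostname(), unix.getUserName())
    hawqstandbywatch.logger = get_default_logger()   # monitor_logs() uses this global

    watcher = hawqstandbywatch.SyncmasterWatcher("/data/hawq/master")  # example path
    rc = watcher.monitor_logs()   # 0: syncmaster looks ok, 1: problem detected
    watcher.close()
    sys.exit(rc)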

