hawq-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From odiache...@apache.org
Subject [02/13] incubator-hawq git commit: HAWQ-668. hawq check should be able to check yarn settings
Date Tue, 03 May 2016 00:33:18 GMT
HAWQ-668. hawq check should be able to check yarn settings


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e74109bf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e74109bf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e74109bf

Branch: refs/heads/HAWQ-703
Commit: e74109bf6645a24bbbd2ce37a32d01e981c670e5
Parents: 1469782
Author: rlei <rlei@pivotal.io>
Authored: Wed Apr 13 17:13:02 2016 +0800
Committer: rlei <rlei@pivotal.io>
Committed: Tue Apr 19 17:23:27 2016 +0800

----------------------------------------------------------------------
 src/backend/utils/misc/etc/gpcheck.cnf |  48 +++-
 tools/bin/gpcheck                      | 365 ++++++++++++++++++++++++++--
 tools/bin/gppylib/gpcheckutil.py       |  17 +-
 tools/bin/hawqpylib/hawqlib.py         |  18 ++
 tools/sbin/gpcheck_hostdump            |  50 +++-
 5 files changed, 472 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/src/backend/utils/misc/etc/gpcheck.cnf
----------------------------------------------------------------------
diff --git a/src/backend/utils/misc/etc/gpcheck.cnf b/src/backend/utils/misc/etc/gpcheck.cnf
index 9ccac0d..9d36de6 100644
--- a/src/backend/utils/misc/etc/gpcheck.cnf
+++ b/src/backend/utils/misc/etc/gpcheck.cnf
@@ -40,12 +40,11 @@ hard.nproc  = 131072
 diskusage.monitor.mounts = /
 diskusage.monitor.usagemax = 90%
 
-[hdfs]
+[hdfs.base]
 dfs.mem.namenode.heap = 40960
 dfs.mem.datanode.heap = 6144
 # in hdfs-site.xml
 dfs.support.append = true
-dfs.client.enable.read.from.local = true
 dfs.block.local-path-access.user = gpadmin
 dfs.datanode.max.transfer.threads = 40960
 dfs.client.socket-timeout = 300000000
@@ -54,5 +53,48 @@ dfs.namenode.handler.count = 60
 ipc.server.handler.queue.size = 3300
 dfs.datanode.handler.count = 60
 ipc.client.connection.maxidletime = 3600000
-dfs.namenode.accesstime.precision = -1
+dfs.namenode.accesstime.precision = 0
+dfs.client.read.shortcircuit = true
 
+[hdfs.non]
+dfs.block.access.token.enable = FALSE
+
+[hdfs.ha]
+dfs.block.access.token.enable = FALSE
+
+[hdfs.kerberos]
+dfs.block.access.token.enable = TRUE
+dfs.datanode.data.dir.perm = 750
+
+[hdfs.ha.kerberos]
+dfs.block.access.token.enable = TRUE
+
+[yarn.base]
+yarn.resourcemanager.scheduler.class = org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
+
+[yarn.non]
+
+[yarn.ha]
+
+[yarn.kerberos]
+hadoop.security.authentication = kerberos
+hadoop.proxyuser.yarn.groups = *
+hadoop.proxyuser.yarn.hosts = *
+hadoop.proxyuser.postgres.hosts = *
+hadoop.proxyuser.postgres.groups = *
+
+[yarn.ha.kerberos]
+hadoop.security.authentication = kerberos
+hadoop.proxyuser.yarn.groups = *
+hadoop.proxyuser.yarn.hosts = *
+hadoop.proxyuser.postgres.hosts = *
+hadoop.proxyuser.postgres.groups = *
+
+[hawq.base]
+dfs.client.read.shortcircuit = true
+
+[hawq.kerberos]
+hadoop.security.authentication = kerberos
+
+[hawq.yarn]
+hawq_global_rm_type = yarn

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gpcheck
----------------------------------------------------------------------
diff --git a/tools/bin/gpcheck b/tools/bin/gpcheck
index aefe499..1d0019c 100755
--- a/tools/bin/gpcheck
+++ b/tools/bin/gpcheck
@@ -26,8 +26,10 @@ try:
     from gppylib.commands.unix import getLocalHostname, getUserName, SYSTEM
     from gppylib.commands.base import WorkerPool, Command, REMOTE
     from gppylib.gpcheckutil import HostType, hosttype_str
+    from hawqpylib.hawqlib import remote_ssh_output
     from pgdb import DatabaseError
     import pg
+    import stat
 
 except ImportError, e:    
     sys.exit('Cannot import modules.  Please check that you have sourced greenplum_path.sh.  Detail: ' + str(e))
@@ -89,15 +91,29 @@ class GpCheckConfig:
         self.hdfs_expected = { # default value for HDFS configuration
                 "dfs.mem.namenode.heap": 8192,
                 "dfs.mem.datanode.heap": 8192 }
+        self.hdfs_non_expected = {}
+        self.hdfs_ha_expected = {}
+        self.hdfs_kerberos_expected = {}
+        self.hdfs_ha_kerberos_expected = {}
+
+        self.yarn_expected = {}
+        self.yarn_non_expected = {}
+        self.yarn_ha_expected = {}
+        self.yarn_kerberos_expected = {}
+        self.yarn_ha_kerberos_expected = {}
+
+        self.hawq_expected = {}
+        self.hawq_kerberos_expected = {}
+        self.hawq_yarn_expected = {}
+
 
     def readConfigFile(self, config_file):
         parsed_list = self.parser.read(config_file)
         if len(parsed_list) != 1:
             raise GpCheckError("cannot open file!")
 
-        for required_section in ("linux.sysctl", "hdfs"):
-            if not self.parser.has_section(required_section):
-                raise GpCheckError("require section '%s'" % required_section)
+        if not self.parser.has_section("linux.sysctl"):
+            raise GpCheckError("require section 'linux.sysctl'")
 
         section = "global"
         if self.parser.has_option(section, "configfile_version"):
@@ -136,15 +152,75 @@ class GpCheckConfig:
                 raise GpCheckError("Bad config entry value '%s' for 'diskusage.monitor.usagemax': %s" %
                         (self.diskusage_usagemax, e))
 
-        section = 'hdfs'
-        for opt in self.parser.options(section):
-            self.hdfs_expected[opt] = self.parser.get(section, opt)
-        try:
-            self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
-            self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
-        except ValueError, e:
-            raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+        if not self.parser.has_section('hdfs.base'):
+            if not self.parser.has_section("hdfs"):
+                raise GpCheckError("require section 'hdfs'")
+
+            section = 'hdfs'
+            for opt in self.parser.options(section):
+                self.hdfs_expected[opt] = self.parser.get(section, opt)
+            try:
+                self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
+                self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
+            except ValueError, e:
+                raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+        else:
+            section = 'hdfs.base'
+            for opt in self.parser.options(section):
+                self.hdfs_expected[opt] = self.parser.get(section, opt)
+            try:
+                self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
+                self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
+            except ValueError, e:
+                raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+
+            section = 'hdfs.non'
+            for opt in self.parser.options(section):
+                self.hdfs_non_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hdfs.ha'
+            for opt in self.parser.options(section):
+                self.hdfs_ha_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hdfs.kerberos'
+            for opt in self.parser.options(section):
+                self.hdfs_kerberos_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hdfs.ha.kerberos'
+            for opt in self.parser.options(section):
+                self.hdfs_ha_kerberos_expected[opt] = self.parser.get(section, opt)
 
+            section = 'yarn.base'
+            for opt in self.parser.options(section):
+                self.yarn_expected[opt] = self.parser.get(section, opt)
+
+            section = 'yarn.non'
+            for opt in self.parser.options(section):
+                self.yarn_non_expected[opt] = self.parser.get(section, opt)
+
+            section = 'yarn.ha'
+            for opt in self.parser.options(section):
+                self.yarn_ha_expected[opt] = self.parser.get(section, opt)
+
+            section = 'yarn.kerberos'
+            for opt in self.parser.options(section):
+                self.yarn_kerberos_expected[opt] = self.parser.get(section, opt)
+
+            section = 'yarn.ha.kerberos'
+            for opt in self.parser.options(section):
+                self.yarn_ha_kerberos_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hawq.base'
+            for opt in self.parser.options(section):
+                self.hawq_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hawq.kerberos'
+            for opt in self.parser.options(section):
+                self.hawq_kerberos_expected[opt] = self.parser.get(section, opt)
+
+            section = 'hawq.yarn'
+            for opt in self.parser.options(section):
+                self.hawq_yarn_expected[opt] = self.parser.get(section, opt)
 
 ###### Global Variables #############
 logger = get_default_logger()
@@ -176,6 +252,16 @@ def checkPlatform():
         raise GpCheckError("No tests exists for this platform in gpcheck")
 
 
+def parse_host_list_file(host_file):
+    host_list = list()
+    with open(host_file) as f:
+        hosts = f.readlines()
+    for host in hosts:
+        host = host.split("#",1)[0].strip()
+        if host:
+            host_list.append(host)
+    return host_list
+
 def parseargs():
     global options, GPHOME, HADOOP_HOME, GPCHECK_CONFIG_FILE
 
@@ -188,7 +274,12 @@ def parseargs():
     parser.add_option('--zipin', type='string')
     parser.add_option('--gphome', type='string')
     # for HDFS xml and memory check
-    parser.add_option('--hadoop', type='string')
+    parser.add_option('--hadoop', '--hadoop-home', type='string')
+    parser.add_option('--hdfs', action='store_true')
+    parser.add_option('--hdfs-ha', dest="hdfs_ha", action='store_true')
+    parser.add_option('--yarn', action='store_true')
+    parser.add_option('--yarn-ha', dest="yarn_ha", action='store_true')
+    parser.add_option('--kerberos', action='store_true')
 
     parser.add_option('-c', '--config', type='string') # optional: gpcheck config file path
     parser.add_option('-f', '--file',  type='string')  # host file, for testing a list of hosts
@@ -212,6 +303,10 @@ def parseargs():
     if not HADOOP_HOME:
         checkFailed(None, "utility will SKIP HDFS configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")
 
+    if options.yarn and not HADOOP_HOME:
+        options.yarn = False
+        checkFailed(None, "utility will SKIP YARN configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")
+
     # params check
     if not options.file and not options.host and not options.zipin:
         raise GpCheckError(" --file or --host or --zipin must be specified")
@@ -242,6 +337,7 @@ def checkFailed(host, msg):
 
 def getHDFSNamenodeHost():
     core_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/core-site.xml")
+    hdfs_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/hdfs-site.xml")
     logger.info("try to detect namenode from %s" % core_site_file)
 
     # for processing property xml
@@ -255,12 +351,42 @@ def getHDFSNamenodeHost():
     for node in xmldoc.getElementsByTagName('property'):
         if getPropName(node) == 'fs.default.name' or getPropName(node) == 'fs.defaultFS':
             fsurl = getPropValue(node).strip()
-            namenode_addr = re.search(r"//([^:/]*)", fsurl).group(1)
+            namenode_list_alias = re.search(r"//([^:/]*)", fsurl).group(1)
+            if_ha_disabled = re.search(".*:[0-9]+$", fsurl)
+            if if_ha_disabled:
+                namenode_addr = namenode_list_alias
+            else:
+                namenode_addr = ''
             break
 
     # run hostname command on remote to get actual hostname
     if namenode_addr == '':
-        logger.error("cannot detect namenode from %s" % core_site_file)
+        ha_namenode_list = ''
+        default_namenode_alias = ''
+        with open(hdfs_site_file) as f:
+            xmldoc = minidom.parse(f)
+        for node in xmldoc.getElementsByTagName('property'):
+            if re.search('dfs.ha.namenodes.*', getPropName(node).strip()):
+                ha_namenode_list = getPropValue(node).strip()
+                default_namenode_alias = ha_namenode_list.split(',')[0].strip()
+                break
+
+        if ha_namenode_list == '':
+            logger.error("cannot detect namenode from %s" % core_site_file)
+            raise GpCheckError("cannot detect namenode from %s" % core_site_file)
+            #sys.exit(1)
+        else:
+            with open(hdfs_site_file) as f:
+                xmldoc = minidom.parse(f)
+            for node in xmldoc.getElementsByTagName('property'):
+                namenode_rpc_address = "dfs.namenode.rpc-address.%s.%s" % (namenode_list_alias,
+                                                                           default_namenode_alias)
+                if getPropName(node) == namenode_rpc_address:
+                    default_namenode_rpc_address = getPropValue(node).strip()
+                    namenode_addr = default_namenode_rpc_address.split(':')[0].strip()
+
+    if namenode_addr == '':
+        raise GpCheckError("cannot detect namenode from %s" % core_site_file)
     else:
         cmd = Command(namenode_addr, "hostname", REMOTE, namenode_addr)
         pool.addCommand(cmd)
@@ -345,10 +471,12 @@ def runCollectionOnServers():
         else:
             raise GpCheckError("unsupported host type")
 
-        cmd = "%s/sbin/gpcheck_hostdump %s" % (GPHOME, host_type_cl)
+        cmd = "%s/sbin/gpcheck_hostdump --hawq %s" % (GPHOME, host_type_cl)
         cmd += " --sysctl %s" % ",".join(gpcheck_config.sysctl_expected.keys())
         if HADOOP_HOME:
             cmd += " --hadoop %s" % HADOOP_HOME
+        if options.yarn or options.yarn_ha:
+            cmd += " --yarn"
         return cmd
 
     try:
@@ -537,7 +665,7 @@ def testSolarisEtcUserAttr(host):
         checkFailed(host.hostname, "/etc/user_attr is missing expected line '%s'" % line)
  
 
-def testHAWQ(host):
+def testHAWQGUC(host):
     if not gpcheck_info.hawq_collected_ok:
         return
 
@@ -567,7 +695,7 @@ def testHAWQ(host):
             return
 
         # check HAWQ master's memory size
-        expected_vmemory_size = 1024
+        expected_vmemory_size = 8192
         if guc_vmemsize_master != expected_vmemory_size:
             checkFailed(host.hostname, "HAWQ master's %s GUC value is %s, expected %s" % (
                 HAWQ_GUC_MEMORY, guc_vmemsize_master, expected_vmemory_size))
@@ -582,7 +710,7 @@ def testHAWQ(host):
             logger.warning("please change the expected data node memory 'dfs.mem.datanode.heap' in gpcheck.cnf file")
             logger.warning("SKIP '%s' check" %(HAWQ_GUC_MEMORY))
             return
-        expect_vmemsize_per_segment = 1024
+        expect_vmemsize_per_segment = 8192 
         if guc_vmemsize_master != expect_vmemsize_per_segment:
             checkFailed(host.hostname, "HAWQ segment's %s GUC value on this host is %s, expected %s" % (
                 HAWQ_GUC_MEMORY, guc_vmemsize_master, expect_vmemsize_per_segment))
@@ -602,6 +730,120 @@ def testDiskCapacity(host):
     return
 
 
+def testHAWQconfig(host):
+    hawq = host.data.hawq
+    hdfs = host.data.hdfs
+    if hawq is None:
+        return # skip HAWQ test when hawq is None
+
+    if options.verbose:
+        logger.info("-- test HAWQ config")
+
+    if hawq.errormsg:
+        checkFailed(host.hostname, "collect HAWQ configuration error: %s" % hawq.errormsg)
+        return
+
+    datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
+    is_datanode = False
+    if host.hostname in datanode_list:
+        is_datanode = True
+
+    expect_config = gpcheck_config.hawq_expected
+
+    if options.kerberos:
+        expect_config.update(gpcheck_config.hawq_kerberos_expected)
+
+    if options.yarn or options.yarn_ha:
+        expect_config.update(gpcheck_config.hawq_yarn_expected)
+
+    actual_config = hawq.site_config
+    hdfs_actual_config = hdfs.site_config
+
+    for exp_key, exp_val in expect_config.items():
+        if exp_key not in actual_config:
+            checkFailed(host.hostname, "HAWQ configuration missing: '%s' needs to be set to '%s'" % (exp_key, exp_val))
+
+        else:
+            actual_val = actual_config[exp_key]
+            et = (exp_key, exp_val, actual_val)
+
+            if exp_key == "dfs.block.local-path-access.user":
+                if exp_val not in actual_val.split(','):
+                    checkFailed(host.hostname, "HDFS configuration: '%s' should include user '%s', actual value is '%s'" % et)
+            elif exp_key == "dfs.namenode.handler.count":
+                if int(exp_val) > int(actual_val):
+                    checkFailed(host.hostname, "HDFS configuration: '%s' should be at least '%s', actual value is '%s'" % et)
+            else:
+                if exp_val != actual_val:
+                    checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % et)
+
+    if not options.kerberos:
+        if 'hadoop.security.authentication' in actual_config:
+            if actual_config['hadoop.security.authentication'] != 'simple':
+                checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', actual_config[hadoop.security.authentication]))
+
+        if 'hadoop.security.authentication' in hdfs_actual_config:
+            if hdfs_actual_config['hadoop.security.authentication'] != 'simple':
+                checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', hdfs_actual_config[hadoop.security.authentication]))
+
+    if options.yarn or options.yarn_ha:
+        hawq_yarn_property_exist_list = ['hawq_rm_yarn_address', 'hawq_rm_yarn_scheduler_address', 'hawq_rm_yarn_app_name']
+        for item in hawq_yarn_property_exist_list:
+            if item in actual_config:
+                if not actual_config[item]:
+                        checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address is empty")
+            else:
+                checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address not defined")
+
+    if 'dfs.client.read.shortcircuit' not in actual_config:
+        checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined")
+
+    if 'dfs.client.read.shortcircuit' not in hdfs_actual_config:
+        checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined")
+
+    if 'dfs.domain.socket.path' not in actual_config:
+        checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path not defined")
+
+    if 'dfs.domain.socket.path' not in hdfs_actual_config:
+        checkFailed(host.hostname, "HDFS configuration dfs.domain.socket.path not defined")
+
+    if is_datanode and 'dfs.domain.socket.path' in actual_config and 'dfs.domain.socket.path' in hdfs_actual_config:
+        if actual_config['dfs.domain.socket.path'] != hdfs_actual_config['dfs.domain.socket.path']:
+            checkFailed(host.hostname, "HAWQ configuration: dfs.domain.socket.path expect to have the same value with HDFS configuration")
+        else:
+            cmd = "ls -l %s" % actual_config['dfs.domain.socket.path']
+            (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
+            if result == 0:
+                if output.split(' ')[0][7:9] != 'rw':
+                    checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s should have R/W access for both hawq and HDFS on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))
+            else:
+                checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s, does not exist on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))
+
+    if 'output.replace-datanode-on-failure' in actual_config:
+        if len(datanode_list) < 4:
+            if actual_config['output.replace-datanode-on-failure'] == 'true':
+                checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect false, current is true")
+        else:
+            if actual_config['output.replace-datanode-on-failure'] == 'false':
+                checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect true, current is false")
+    else:
+        checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure not defined")
+
+
+def testDiskCapacity(host):
+    if options.verbose:
+        logger.info("-- test Disk Capacity")
+
+    for line in host.data.diskusage.lines:
+        if len(gpcheck_config.diskusage_mounts) == 0 or line.mount in gpcheck_config.diskusage_mounts:
+            actual_usage = int(line.used_percent[:-1])
+            if actual_usage > gpcheck_config.diskusage_usagemax:
+                checkFailed(host.hostname,
+                           "potential disk full risk: %s mounted on %s has used %s space" % (
+                               line.fs, line.mount, line.used_percent))
+    return
+
+
 def testHDFSConfig(host):
     hdfs = host.data.hdfs
     if hdfs is None:
@@ -615,6 +857,30 @@ def testHDFSConfig(host):
         return
 
     expect_config = gpcheck_config.hdfs_expected
+
+    if not options.hdfs_ha and not options.kerberos:
+        expect_config.update(gpcheck_config.hdfs_non_expected)
+
+    if options.hdfs_ha and not options.kerberos:
+        expect_config.update(gpcheck_config.hdfs_ha_expected)
+
+    if options.kerberos and not options.hdfs_ha:
+        expect_config.update(gpcheck_config.hdfs_kerberos_expected)
+
+    if options.kerberos and options.hdfs_ha:
+        expect_config.update(gpcheck_config.hdfs_ha_kerberos_expected)
+
+
+    if options.yarn or options.yarn_ha:
+        expect_config.update(gpcheck_config.yarn_expected)
+        if not options.yarn_ha and not options.kerberos:
+            expect_config.update(gpcheck_config.yarn_non_expected)
+
+        if options.yarn_ha:
+            expect_config.update(gpcheck_config.yarn_ha_expected)
+        if options.kerberos:
+            expect_config.update(gpcheck_config.yarn_kerberos_expected)
+
     actual_config = hdfs.site_config
     actual_heap_size = hdfs.namenode_heap_size if host.is_namenode else hdfs.datanode_heap_size
 
@@ -658,6 +924,64 @@ def testHDFSConfig(host):
                                   (actual_heap_size, expect_datanode_heap))
 
 
+    # Check if nodemanager direcotries exists
+    directory_check_list = []
+    datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
+    is_datanode = False
+    if host.hostname in datanode_list:
+        is_datanode = True
+
+    if options.yarn or options.yarn_ha:
+        yarn_enabled = True
+    else:
+        yarn_enabled = False
+
+    if yarn_enabled and is_datanode:
+        if 'yarn.nodemanager.local-dirs' in actual_config: 
+            directory_check_list += actual_config['yarn.nodemanager.local-dirs'].split(',')
+        else:
+            checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.local-dirs not defined")
+
+        if 'yarn.nodemanager.log-dirs' in actual_config: 
+            directory_check_list += actual_config['yarn.nodemanager.log-dirs'].split(',')
+        else:
+            checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.log-dirs not defined")
+
+    for directory in directory_check_list:
+        cmd = "test -e %s" % directory
+        (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
+        if result != 0:
+            checkFailed(host.hostname, "YARN nodemanager directory %s does not exist" % directory)
+
+    # Check if resource manager property exists
+    if options.yarn:
+        yarn_property_exist_list = ['yarn.resourcemanager.address', 'yarn.resourcemanager.scheduler.address']
+
+    if options.yarn_ha:
+        yarn_property_exist_list = ['yarn.resourcemanager.address.rm1', 'yarn.resourcemanager.address.rm2', 'yarn.resourcemanager.scheduler.address.rm1', \
+                                    'yarn.resourcemanager.scheduler.address.rm2']
+
+    if yarn_enabled:
+        for item in yarn_property_exist_list:
+            if item in actual_config:
+                if not actual_config[item]:
+                        checkFailed(host.hostname, "YARN configuration: %s is empty" % item)
+            else:
+                checkFailed(host.hostname, "YARN configuration: %s not defined" % item)
+
+    # Check yarn kerberos properties
+    #yarn_kerberos_check_list = ['hadoop.proxyuser.yarn.groups', 'hadoop.proxyuser.yarn.hosts', 'hadoop.proxyuser.postgres.hosts', 'hadoop.proxyuser.postgres.groups']
+    if yarn_enabled and options.kerberos:
+        yarn_kerberos_check_list = ['yarn.nodemanager.keytab', 'yarn.nodemanager.principal','hadoop.proxyuser.postgres.groups', \
+                                    'yarn.resourcemanager.keytab', 'yarn.resourcemanager.principal']
+        for item in yarn_kerberos_check_list:
+            if item in actual_config:
+                if not actual_config[item]:
+                        checkFailed(host.hostname, "YARN configuration: %s is empty, expected non-empty" % item)
+            else:
+                checkFailed(host.hostname, "YARN configuration missing: %s" % item)
+
+
 def testIOSchedulers(host):
     if options.verbose:
         logger.info("-- test IO scheduler")
@@ -774,6 +1098,8 @@ def testNtp(host):
 def testGenericLinuxHost(host):
     logger.info("test on host: %s" % host.hostname)
     if host.is_namenode:
+        testHAWQGUC(host)
+        testHAWQconfig(host)
         testHDFSConfig(host)
         testDiskCapacity(host)
         testSysctl(host)
@@ -782,7 +1108,8 @@ def testGenericLinuxHost(host):
         testNtp(host)
 
     else:
-        testHAWQ(host)
+        testHAWQGUC(host)
+        testHAWQconfig(host)
         testDiskCapacity(host)
         testHDFSConfig(host)
         testIOSchedulers(host)

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gppylib/gpcheckutil.py
----------------------------------------------------------------------
diff --git a/tools/bin/gppylib/gpcheckutil.py b/tools/bin/gppylib/gpcheckutil.py
index 9956990..3419bb2 100755
--- a/tools/bin/gppylib/gpcheckutil.py
+++ b/tools/bin/gppylib/gpcheckutil.py
@@ -151,6 +151,20 @@ class hdfs:
             return "============= HDFS ==========================\n" + output
 
 
+class hawq:
+    def __init__(self):
+        self.site_config = dict()
+        self.errormsg = None
+
+    def __str__(self):
+        if self.errormsg:
+            return "============= HAWQ ERROR ====================\n" + self.errormsg
+        else:
+            output  = "HAWQ checks \n" 
+            output += "\n".join(["%s = %s" % (k, self.site_config[k]) for k in sorted(self.site_config.iterkeys())])
+            return "============= HAWQ ==========================\n" + output
+
+
 class diskusage_entry:
     def __init__(self, fs, size, used, avail, used_percent, mount):
         self.fs = fs
@@ -336,6 +350,7 @@ class GenericLinuxOutputData:
         self.uname = None
         self.machine = None
         self.hdfs = None
+        self.hawq = None
         self.diskusage = None
         self.sysctl = None
         self.limitsconf = None
@@ -346,7 +361,7 @@ class GenericLinuxOutputData:
 
     def __str__(self):
         applied_checks = filter(lambda x: x is not None,
-                                [ self.uname, self.machine, self.hdfs, self.diskusage, self.sysctl,
+                                [ self.uname, self.machine, self.hdfs, self.hawq, self.diskusage, self.sysctl,
                                   self.limitsconf, self.mounts, self.ioschedulers, self.blockdev, self.ntp ])
         return "\n".join(map(str, applied_checks))
 

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/hawqpylib/hawqlib.py
----------------------------------------------------------------------
diff --git a/tools/bin/hawqpylib/hawqlib.py b/tools/bin/hawqpylib/hawqlib.py
index ae0d852..c149ffc 100755
--- a/tools/bin/hawqpylib/hawqlib.py
+++ b/tools/bin/hawqpylib/hawqlib.py
@@ -131,6 +131,24 @@ def check_property_exist_xml(xml_file, property_name):
     return property_exist, property_name, property_value
 
 
+def get_xml_values(xmlfile):
+    xml_dict = {}
+    with open(xmlfile) as f:
+        xmldoc = minidom.parse(f)
+
+    for node in xmldoc.getElementsByTagName('property'):
+        name = node.getElementsByTagName('name')[0].childNodes[0].data.encode('ascii')
+
+        try:
+            value = node.getElementsByTagName('value')[0].childNodes[0].data.encode('ascii')
+        except:
+            value = None
+
+        xml_dict[name] = value
+
+    return xml_dict
+
+
 class HawqXMLParser:
     def __init__(self, GPHOME):
         self.GPHOME = GPHOME

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/sbin/gpcheck_hostdump
----------------------------------------------------------------------
diff --git a/tools/sbin/gpcheck_hostdump b/tools/sbin/gpcheck_hostdump
index 7714cc3..28f074e 100755
--- a/tools/sbin/gpcheck_hostdump
+++ b/tools/sbin/gpcheck_hostdump
@@ -31,7 +31,7 @@ try:
     from gppylib.gpparseopts import OptParser, OptChecker
     from gppylib.gpcheckutil import ApplianceOutputData, GenericLinuxOutputData, GenericSolarisOutputData
     from gppylib.gpcheckutil import chkconfig, omreport, grubconf, mounts, GpMount, GpMount, inittab, ntp
-    from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, machine
+    from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, hawq,  machine
     from gppylib.gpcheckutil import solaris_etc_system, solaris_etc_project, solaris_etc_user_attr
 except ImportError, e:    
     sys.exit('Cannot import modules.  Please check that you have sourced greenplum_path.sh.  Detail: ' + str(e))
@@ -413,15 +413,53 @@ def collectCPUandMemoryInfo():
     return data
 
 
+def collectHAWQ():
+    if not options.hawq:
+        return None
+    data = hawq()
+    hawq_config_dir = os.environ.get('GPHOME')
+    if hawq_config_dir is None:
+        print "Please export GPHOME first, exit"
+        sys.exit(1)
+    hdfs_client_file = os.path.join(hawq_config_dir, "etc/hdfs-client.xml")
+    yarn_client_file = os.path.join(hawq_config_dir, "etc/yarn-client.xml")
+    hawq_site_file = os.path.join(hawq_config_dir, "etc/hawq-site.xml")
+
+    # collect HDFS site config
+    getPropName =  lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
+    getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
+    hawq_config_file_list = [hdfs_client_file, hawq_site_file]
+    if options.yarn:
+        hawq_config_file_list.append(yarn_client_file)
+    for filename in hawq_config_file_list:
+        try:
+            with open(filename) as f:
+                xmldoc = minidom.parse(f)
+            for node in xmldoc.getElementsByTagName('property'):
+                try:
+                    data.site_config[getPropName(node)] = getPropValue(node).strip()
+                except IndexError:
+                    pass # the <value> tag may be empty, which causes IndexError in getPropValue
+
+        except Exception, e:
+            data.errormsg = "Failed to read HAWQ config file '%s': %s" % (filename, e)
+
+    return data
+
+
 def collectHDFS():
     if not options.hadoop:
         return None
     data = hdfs()
+    hawq_config_dir = os.environ.get('GPHOME')
+    if hawq_config_dir is None:
+        print "Please export GPHOME first, exit"
+        sys.exit(1)
     hadoop_config_file = os.path.join(options.hadoop, "libexec/hadoop-config.sh")
     hadoop_env_file = os.path.join(options.hadoop, "etc/hadoop/hadoop-env.sh")
     hdfs_site_file = os.path.join(options.hadoop, "etc/hadoop/hdfs-site.xml")
+    yarn_site_file = os.path.join(options.hadoop, "etc/hadoop/yarn-site.xml")
     core_site_file = os.path.join(options.hadoop, "etc/hadoop/core-site.xml")
-    libhdfs3_site_file = os.environ.get("LIBHDFS3_CONF")
 
     # collect java heap size config
     p = subprocess.Popen(". %s; echo $JAVA_HEAP_MAX" % hadoop_config_file, shell = True,
@@ -457,7 +495,10 @@ def collectHDFS():
     # collect HDFS site config
     getPropName =  lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
     getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
-    for filename in (hdfs_site_file, core_site_file, libhdfs3_site_file):
+    hdfs_config_file_list = [hdfs_site_file, core_site_file]
+    if options.yarn:
+        hdfs_config_file_list.append(yarn_site_file)
+    for filename in hdfs_config_file_list:
         try:
             with open(filename) as f:
                 xmldoc = minidom.parse(f)
@@ -804,6 +845,7 @@ def processGenericLinuxServer():
     output = GenericLinuxOutputData()
 
     output.hdfs = collectHDFS()
+    output.hawq = collectHAWQ()
     output.uname = collectUname()
     output.machine = collectCPUandMemoryInfo()
     output.diskusage = collectDiskUsage()
@@ -844,6 +886,8 @@ def parseargs():
     parser.remove_option('-h')
     parser.add_option('-h', '-?', '--help', action='store_true')
     parser.add_option('--hadoop', type='string')
+    parser.add_option('--hawq', action='store_true')
+    parser.add_option('--yarn', action='store_true')
     parser.add_option('--sysctl', type='string')
     parser.add_option('--appliance',  action='store_true')
     parser.add_option('--linux',  action='store_true')


Mime
View raw message