From dev-return-5636-archive-asf-public=cust-asf.ponee.io@madlib.apache.org Wed Nov 13 00:58:01 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id B7BE918067E for ; Wed, 13 Nov 2019 01:58:00 +0100 (CET) Received: (qmail 9081 invoked by uid 500); 13 Nov 2019 00:58:00 -0000 Mailing-List: contact dev-help@madlib.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@madlib.apache.org Delivered-To: mailing list dev@madlib.apache.org Received: (qmail 9041 invoked by uid 99); 13 Nov 2019 00:57:59 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 13 Nov 2019 00:57:59 +0000 From: GitBox To: dev@madlib.apache.org Subject: [GitHub] [madlib] kaknikhil commented on a change in pull request #455: DL: Add new helper function for gpu_configuration Message-ID: <157360667964.9443.12958719569841466778.gitbox@gitbox.apache.org> Date: Wed, 13 Nov 2019 00:57:59 -0000 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit kaknikhil commented on a change in pull request #455: DL: Add new helper function for gpu_configuration URL: https://github.com/apache/madlib/pull/455#discussion_r345522240 ########## File path: src/ports/postgres/modules/deep_learning/madlib_keras_gpu_info.py_in ########## @@ -0,0 +1,203 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +@file madlib_keras_gpu_info.py_in + +@brief GPU configuration helper function + +@namespace madlib_keras_gpu_info +""" + +import plpy +from utilities.utilities import is_platform_pg +from utilities.utilities import unique_string +import subprocess +import os + +class OutputInfoSchema: + INFO_TABLE = unique_string(desp='gpu_info') + SEG_ID_COL = 'gp_seg_id' + GPU_DESCR_COL = 'gpu_descr' + + +class Source: + NVIDIA = 'nvidia' + TENSORFLOW = 'tensorflow' + + +class GPUInfoFunctions: + @staticmethod + def get_gpu_info_from_nvidia(): + """ + This function will run only on segment(s). Make sure not to run any non + select plpy execute. + :return: list of gpu descriptions as returned by nvidia-smi -L. + """ + try: + return subprocess.check_output(["nvidia-smi", "-L"]).splitlines() + except OSError: # Handle case when nvidia-smi is not found + return [] + except Exception as ex: # Raise exception for all other cases + plpy.error("Running nvidia-smi failed with exception {0}".format(str(ex))) + + @staticmethod + def get_gpu_info_from_tensorflow(): + """ + This function will run only on segment(s). Make sure not to run any non + select plpy execute. 
+ :return: list of gpu descriptions as returned by tensorflow + """ + current_working_dir = os.path.dirname(os.path.realpath(__file__)) + gpus = subprocess.check_output(["python", "gpu_info_from_tf.py"], + cwd=current_working_dir).splitlines() + return gpus + + +def gpu_configuration(schema_madlib, source): + """ + :return: List of gpus along with their hostname in the format + gpu_descr | hostname + ------------------+-------------------------- + NVIDIA Tesla P100 | pm-demo-machine-keras1 + NVIDIA Tesla P100 | pm-demo-machine-keras1 + Super Duper GPU | pm-demo-machine-keras2 + Super Duper GPU | pm-demo-machine-keras2 + 1. We use gp_dist_random to run either the nvidia smi UDF or the tensorflow UDF + on all the hosts. + 2. Also we do not need to run the tf/nvidia UDF on all the segments, just + one segment per host. That's why we group the output of + gp_segment_configuration by hostname and get the min segment from each host. + 3. To get the hostname along with the gpu description, we have to join the + output of nvidia/tf UDF with gp_segment_configuration and filter out the + following + * master + * mirror segments + * empty/null gpu description + Workflow for gpdb + 1. Run query to get min seg ids on each host. This is so that we can run + the gpu UDF on just one segment per host. + 2. Create a table by running the tf/nvidia UDF on the segment ids returned + from the previous step. Note that this table will only contain the output + of the UDF and the segment id itself. This table does not contain hostnames + 3. To get the hostname associated with the segment id, we need to join the + table created in step with gp_segment_configuration. + It's important to note that we can merge all these 3 queries into one but + the problem with that is that a redistribution happens before running the UDF + which means the UDF does not run on the segments that we pass in to the query. 
+ To avoid this, we broke down the query into 3 parts so that the UDF is always + run on the intended segments. + """ + if not source: + source = Source.TENSORFLOW + source = source.lower() + if source != Source.TENSORFLOW and source != Source.NVIDIA: + plpy.error("DL: source has to be one of {0} or {1}".format( + Source.TENSORFLOW, Source.NVIDIA)) + + gpu_fn_name = 'gpu_info_{0}'.format(source) + if is_platform_pg(): + return gpu_for_postgres(schema_madlib, gpu_fn_name) + else: + return gpu_for_gpdb(schema_madlib, gpu_fn_name) + + +def gpu_for_postgres(schema_madlib, gpu_fn_name): + gpu_info_query = """ + SELECT 'localhost' as hostname, {0} from (SELECT unnest({1}.{2}()) AS {0}) s1 + where {0} is NOT NULL AND {0} != '' + """.format(OutputInfoSchema.GPU_DESCR_COL, schema_madlib, gpu_fn_name) + gpus = plpy.execute(gpu_info_query) + if not gpus or len(gpus) == 0: + return [] + return gpus + + +def gpu_for_gpdb(schema_madlib, gpu_fn_name): + min_seg_on_each_host = get_min_seg_ids_on_each_host() + create_gpu_info_table(schema_madlib, gpu_fn_name, min_seg_on_each_host) + gpus_per_host = get_gpu_info_with_hostname() + + plpy.execute("DROP TABLE IF EXISTS {0}".format(OutputInfoSchema.INFO_TABLE)) + + if not gpus_per_host or len(gpus_per_host) == 0: + return [] + return gpus_per_host + + +def get_min_seg_ids_on_each_host(): + """ + Run query to get min seg ids on each host. This is so that we can run + the gpu UDF on just one segment per host. 
+ :return: List of min seg id per host + """ + min_seg_id_alias = 'min_seg_id' + min_seg_query = """ + SELECT {min_seg_id_alias} FROM + (select hostname, min(content) AS {min_seg_id_alias} + FROM gp_segment_configuration WHERE content != -1 AND role='p' + GROUP BY hostname) min_seg_id_subquery + """.format(**locals()) + min_seg_on_each_host = plpy.execute(min_seg_query) + if not min_seg_on_each_host: Review comment: Yeah, you are right; I will remove the check. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services