From: GitBox
To: dev@singa.apache.org
Subject: [GitHub] [incubator-singa] nudles commented on a change in pull request #468: Distributted module
Message-ID: <156255805596.16904.18398593000061401852.gitbox@gitbox.apache.org>
Date: Mon, 08 Jul 2019 03:54:15 -0000

nudles commented on a change in pull request #468: Distributted module
URL: https://github.com/apache/incubator-singa/pull/468#discussion_r300913547

##########
File path: python/singa/dist_opt.py
##########

@@ -1,27 +1,32 @@
 from . import singa_wrap as singa
+from .opt import SGD

-class Dist_SGD(object):
-    def __init__(self, lr=0.01, nDev=1):
-        self.lr=lr
-        # def start_MPI():
-        #     pass
-        # def create_communicator():
-        #     pass
-        # could be combined with start_MPI
-        self.communicator=singa.Communicator(nDev)
-        self.world_size=self.communicator.totalMPIRanksInGlobal
-        self.rank_in_local=self.communicator.MPIRankInLocal
-        self.rank_in_global=self.communicator.MPIRankInGlobal

-    def dist_update(self, param, grad):
-        # singa.synch(grad.data, self.communicator)
-        # grad /= self.communicator.totalMPIRanksInGlobal
-        grad = self.synch(grad)
-        param -= grad * self.lr
+class DistOpt(object):

-    def synch(self, tensor):
-        singa.synch(tensor.data, self.communicator)
-        tensor /= self.world_size
-        return tensor
-
+    def __init__(self, opt=SGD(), nDev=1):
+        # The class is designed to wrap an optimizer to do disttributed training.
+        # opt: The optimizer to be wrapped. nDev: number of devices(GPUs) a
+        # process will control/use.
+        # world_size: total number of processes.
+        # rank_in_local: local rank of a process on the current node.
+        # rank_in_global: global rank of a process
+
+        self.opt = opt
+        self.communicator = singa.Communicator(nDev)
+        self.world_size = self.communicator.totalMPIRanksInGlobal
+        self.rank_in_local = self.communicator.MPIRankInLocal
+        self.rank_in_global = self.communicator.MPIRankInGlobal
+
+    def update(self, param, grad):
+        # singa.synch(grad.data, self.communicator)
+        # grad /= self.communicator.totalMPIRanksInGlobal
+        grad = self.synch(grad)

Review comment:
   synch? just name it as all_reduce?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

With regards,
Apache Git Services
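
A minimal sketch of how the class could read with the rename the reviewer suggests (synch -> all_reduce). It reuses only the singa.Communicator and singa.synch bindings that appear in the diff above; the final delegation to the wrapped optimizer's update(param, grad) is an assumption for illustration, since the diff is truncated right after the synch call.

    from . import singa_wrap as singa
    from .opt import SGD


    class DistOpt(object):

        def __init__(self, opt=SGD(), nDev=1):
            # Wrap an existing optimizer so that gradients are averaged
            # across all MPI processes before each parameter update.
            self.opt = opt
            self.communicator = singa.Communicator(nDev)
            self.world_size = self.communicator.totalMPIRanksInGlobal

        def all_reduce(self, tensor):
            # Sum the tensor over all ranks (via singa.synch, as in the diff),
            # then divide by the world size to get the average.
            singa.synch(tensor.data, self.communicator)
            tensor /= self.world_size
            return tensor

        def update(self, param, grad):
            # Average the gradient across processes, then hand it to the
            # wrapped optimizer. The opt.update(param, grad) call is an
            # assumption; the diff is cut off after the synch call.
            grad = self.all_reduce(grad)
            self.opt.update(param, grad)

Naming the method all_reduce would match the collective it performs: every rank ends up with the same averaged gradient, so each process applies an identical update.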