singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wan...@apache.org
Subject [1/2] incubator-singa git commit: SINGA-134 Extend SINGA to run over a GPU cluster
Date Tue, 05 Apr 2016 12:50:04 GMT
Repository: incubator-singa
Updated Branches:
  refs/heads/master 914c1e722 -> 8b7d1e09e


SINGA-134 Extend SINGA to run over a GPU cluster

Minor changes to extend the single node Multi-GPU training to the GPU
cluster scenario.
1. Remove gethostip, which is not stable (it only works on some OSes). Users now have to configure
the hostfile with each line specifying the IP address (it must be an IP) of one node.
2. Register the process ID regardless of the running environment.
3. singa-run.sh has to `source` the '.profile' file ('.bashrc' does not work) right before
executing singa; that file exports the LD_LIBRARY_PATH for the cudnn and cuda libraries. There is
no problem if singa is compiled without cuda.

Checked with cpplint.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a3c82ca9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a3c82ca9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a3c82ca9

Branch: refs/heads/master
Commit: a3c82ca913859d690f7bbf7b4706686de0b4d2a8
Parents: 914c1e7
Author: Wei Wang <wangwei@comp.nus.edu.sg>
Authored: Sun Apr 3 17:30:17 2016 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Tue Apr 5 17:46:46 2016 +0800

----------------------------------------------------------------------
 bin/singa-run.sh              |  4 ++--
 include/singa/driver.h        | 12 +++++++-----
 include/singa/stub.h          |  5 ++---
 include/singa/utils/cluster.h |  6 ++----
 src/driver.cc                 | 22 ++++++++++++++++------
 src/stub.cc                   | 11 -----------
 src/utils/cluster.cc          |  1 -
 7 files changed, 29 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index 836342e..9e53ea5 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -86,7 +86,7 @@ singa_run="$exe $args \
 if [ ! -z $job_conf ]; then
   singa_run="$singa_run -conf $job_conf"
 fi
-singa_sshrun="cd $SINGA_HOME; $singa_run"
+singa_sshrun="source ~/.profile; cd $SINGA_HOME; $singa_run"
 
 # ssh and start singa processes
 ssh_options="-oStrictHostKeyChecking=no \
@@ -99,7 +99,7 @@ for i in ${hosts[@]} ; do
     $singa_run &
   else
     echo Executing @ $i : $singa_sshrun
-    ssh $ssh_options $i $singa_sshrun &
+    ssh $ssh_options $i $singa_sshrun " -host " $i &
   fi
 done
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/include/singa/driver.h
----------------------------------------------------------------------
diff --git a/include/singa/driver.h b/include/singa/driver.h
index fb5a33a..0105158 100644
--- a/include/singa/driver.h
+++ b/include/singa/driver.h
@@ -18,10 +18,11 @@
 * under the License.
 *
 *************************************************************/
-#ifndef SINGA_SINGA_DRIVER_H_
-#define SINGA_SINGA_DRIVER_H_
+#ifndef SINGA_DRIVER_H_
+#define SINGA_DRIVER_H_
 
 #include <vector>
+#include <string>
 #include "singa/proto/job.pb.h"
 #include "singa/proto/singa.pb.h"
 #include "singa/utils/factory.h"
@@ -50,7 +51,7 @@ class Driver {
    * Used for python binding. Users can also directly call it as a C++ API.
    * - init glog with given parameters
    *
-   */   
+   */
   void InitLog(char *arg);
   /**
    * Update job configuration and call Train(const JobProto&) to start the
@@ -74,7 +75,7 @@ class Driver {
    * files.
    * @param[in] str serialized string recorded job configuration.
    */
-  void Train(bool resume, const std::string str); 
+  void Train(bool resume, const std::string str);
   /**
    * Create workers and servers to conduct the training.
    *
@@ -204,6 +205,7 @@ class Driver {
 
  private:
   int job_id_;
+  std::string hostip_;
   JobProto job_conf_;
   SingaProto singa_conf_;
 };
@@ -259,4 +261,4 @@ int Driver::RegisterWorker(const Type& type) {
 
 }  // namespace singa
 
-#endif  // SINGA_SINGA_DRIVER_H_
+#endif  // SINGA_DRIVER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/include/singa/stub.h
----------------------------------------------------------------------
diff --git a/include/singa/stub.h b/include/singa/stub.h
index 0ab6fd4..4802535 100644
--- a/include/singa/stub.h
+++ b/include/singa/stub.h
@@ -58,8 +58,8 @@ class Stub {
       const std::vector<Worker*>& workers,
       const std::vector<Server*>& servers);
 
-  const std::string& endpoint() const {
-    return endpoint_;
+  void set_router(Router* router) {
+    router_ = router;
   }
 
  protected:
@@ -100,7 +100,6 @@ class Stub {
 
  protected:
   Router *router_ = nullptr;
-  std::string endpoint_;
   std::vector<int> slice2server_;
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/include/singa/utils/cluster.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cluster.h b/include/singa/utils/cluster.h
index c1dc93b..9e36cf8 100644
--- a/include/singa/utils/cluster.h
+++ b/include/singa/utils/cluster.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -122,7 +122,6 @@ class Cluster {
   inline int ProcsIDOf(int group_id, int id, int flag) {
     return procs_ids_.at(Hash(group_id, id, flag));
   }
-  inline std::string hostip() const { return hostip_; }
 
   /**
    * @param pid, processs ID
@@ -150,7 +149,6 @@ class Cluster {
 
   int procs_id_ = -1;
   int nprocs_ = 0;
-  std::string hostip_ = "";
   // cluster config proto
   ClusterProto cluster_;
   SingaProto singa_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index b8f6735..83f3953 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -49,11 +49,11 @@ namespace singa {
 void Driver::Init(int argc, char **argv) {
   // unique job ID generated from singa-run.sh, passed in as "-singa_job <id>"
   int arg_pos = ArgPos(argc, argv, "-singa_job");
-  job_id_ = (arg_pos != -1) ? atoi(argv[arg_pos+1]) : -1;
+  job_id_ = (arg_pos != -1) ? atoi(argv[arg_pos + 1]) : -1;
   // global signa conf passed by singa-run.sh as "-singa_conf <path>"
   arg_pos = ArgPos(argc, argv, "-singa_conf");
   if (arg_pos != -1)
-    ReadProtoFromTextFile(argv[arg_pos+1], &singa_conf_);
+    ReadProtoFromTextFile(argv[arg_pos + 1], &singa_conf_);
   else
     ReadProtoFromTextFile("conf/singa.conf", &singa_conf_);
   // set log path
@@ -62,7 +62,12 @@ void Driver::Init(int argc, char **argv) {
   // job conf passed by users as "-conf <path>"
   arg_pos = ArgPos(argc, argv, "-conf");
   if (arg_pos != -1)
-    ReadProtoFromTextFile(argv[arg_pos+1], &job_conf_);
+    ReadProtoFromTextFile(argv[arg_pos + 1], &job_conf_);
+  arg_pos = ArgPos(argc, argv, "-host");
+  if (arg_pos != -1)
+    hostip_ = argv[arg_pos + 1];
+  else
+    hostip_ = "localhost";
 
   // register layers
 
@@ -222,9 +227,14 @@ void Driver::Train(const JobProto& job_conf) {
   // no need to create Stub if there is only a single worker without servers,
   // i.e., the training will be conducted by the single worker.
   if (grp_size > 1 || nserver_grps > 0) {
-    stub.Setup();
-    // TODO(wangwei)  register endpoint to zookeeper if > 1 procs;
-    cluster->Register(getpid(), stub.endpoint());  // getpid() is from unistd.h
+    auto router = new Router();
+    if (cluster->nprocs() > 1) {
+      int binding_port = router->Bind("tcp://" + hostip_ + ":*");
+      cluster->Register(getpid(), hostip_ + ":" + std::to_string(binding_port));
+    } else {
+      cluster->Register(getpid(), hostip_ + ":0");  // fake endpoint
+    }
+    stub.set_router(router);
   }
 
   NeuralNet* net = NeuralNet::Create(job_conf.neuralnet(), kTrain, grp_size);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/src/stub.cc
----------------------------------------------------------------------
diff --git a/src/stub.cc b/src/stub.cc
index 4bc8c3d..84c1f8b 100644
--- a/src/stub.cc
+++ b/src/stub.cc
@@ -41,17 +41,6 @@ using std::string;
 Stub::~Stub() {
   delete router_;
 }
-void Stub::Setup() {
-  router_ = new Router();
-  auto cluster = Cluster::Get();
-  if (cluster->nprocs() > 1) {
-    const string hostip = cluster->hostip();
-    int port = router_->Bind("tcp://" + hostip + ":*");
-    endpoint_ = hostip + ":" + std::to_string(port);
-  } else {
-    endpoint_ = "localhost";
-  }
-}
 /**
  * Get a hash id for a Param object from a group.
  *

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a3c82ca9/src/utils/cluster.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
index 13729f2..a9928eb 100644
--- a/src/utils/cluster.cc
+++ b/src/utils/cluster.cc
@@ -85,7 +85,6 @@ void Cluster::Init(int job, const SingaProto& singaConf,
   // cluster_rt_ = new SPClusterRT();
   cluster_rt_ = ClusterRuntime::Create(singa_.zookeeper_host(), job);
   cluster_rt_->Init();
-  hostip_ = GetHostIP();
 }
 
 void Cluster::SetupFolders(const ClusterProto &cluster) {


Mime
View raw message