singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wang...@apache.org
Subject incubator-singa git commit: SINGA-43 Remove Job-related output from workspace
Date Thu, 13 Aug 2015 05:32:52 GMT
Repository: incubator-singa
Updated Branches:
  refs/heads/master 7a61a687c -> 2c7edd73c


SINGA-43 Remove Job-related output from workspace

The singa-run script now takes only a job.conf as input (instead of a workspace);
users are required to set a workspace in their job.conf.

All job information is recorded in the log_dir, which is set in singa.conf.
The directory structure is as follows:
  log_dir/job-info/job-ID-YYYYmmdd-HHMMSS/job.hosts     host list
                                         /job.pids      pid list


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2c7edd73
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2c7edd73
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2c7edd73

Branch: refs/heads/master
Commit: 2c7edd73c0636f2085247ea29300fc1913ee05d5
Parents: 7a61a68
Author: wang sheng <wangsheng1001@gmail.com>
Authored: Thu Aug 13 11:25:12 2015 +0800
Committer: wang sheng <wangsheng1001@gmail.com>
Committed: Thu Aug 13 12:43:38 2015 +0800

----------------------------------------------------------------------
 bin/singa-cleanup.sh       |   2 +-
 bin/singa-console.sh       |  14 ++--
 bin/singa-env.sh           |  13 ++++
 bin/singa-run.sh           |  46 +++++++-----
 bin/singa-stop.sh          |   2 +-
 examples/cifar10/job.conf  |   1 +
 include/utils/cluster_rt.h |   4 +-
 src/main.cc                |   6 +-
 src/proto/job.proto        |   2 +-
 src/proto/singa.proto      |   4 +-
 src/utils/cluster_rt.cc    |   7 +-
 src/utils/tool.cc          | 159 ++++++++++++++++++++++++----------------
 12 files changed, 156 insertions(+), 104 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
index 9b542c6..c987ca4 100755
--- a/bin/singa-cleanup.sh
+++ b/bin/singa-cleanup.sh
@@ -31,5 +31,5 @@ $SINGA_BIN/singa-stop.sh || exit 1
 
 # close zookeeper
 if [ $SINGA_MANAGES_ZK = true ]; then
-  $SINGA_BIN/zk-service.sh stop
+  $SINGA_BIN/zk-service.sh stop || exit 1
 fi

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-console.sh
----------------------------------------------------------------------
diff --git a/bin/singa-console.sh b/bin/singa-console.sh
index 8f7cac1..36913ce 100755
--- a/bin/singa-console.sh
+++ b/bin/singa-console.sh
@@ -23,10 +23,10 @@
 # console to list/view/kill singa jobs
 #
 
-usage="Usage:\n
-       # singa-console.sh list         :  list running singa jobs\n
-       # singa-console.sh view JOB_ID  :  view procs of a singa job\n
-       # singa-console.sh kill JOB_ID  :  kill a singa job"
+usage="Usage: singa-console.sh <command> <args>\n
+        list         :  list running singa jobs\n
+        view JOB_ID  :  view procs of a singa job\n
+        kill JOB_ID  :  kill a singa job"
 
 if [ $# == 0 ]; then
   echo -e $usage
@@ -59,12 +59,11 @@ case $1 in
       echo -e $usage
       exit 1
     fi
-    host_file="job-$2.tmp"
-    ./singatool view $2 1>$host_file || exit 1
+    hosts=`./singatool view "$2"`
+    [ $? == 0 ] || exit 1
     ssh_options="-oStrictHostKeyChecking=no \
              -oUserKnownHostsFile=/dev/null \
              -oLogLevel=quiet"
-    hosts=`cat $host_file | cut -d ' ' -f 1`
     if [ `head -1 "$SINGA_CONF"/hostfile` == localhost ]; then
       local_procs=1
     fi
@@ -79,7 +78,6 @@ case $1 in
         $singa_kill
       fi
     done
-    rm $host_file
     ./singatool clean $2 || exit 1
     ;;
   

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-env.sh
----------------------------------------------------------------------
diff --git a/bin/singa-env.sh b/bin/singa-env.sh
index 10578b8..c9d42bd 100755
--- a/bin/singa-env.sh
+++ b/bin/singa-env.sh
@@ -24,10 +24,14 @@
 #   * SINGA_HOME
 #   * SINGA_BIN
 #   * SINGA_CONF
+#   * SINGA_LOG
 #   * ZK_HOME
 #   * SINGA_MANAGES_ZK
 #
 
+# exit if varaiables already set
+[ -z $SINGA_ENV_DONE ] || exit 0
+
 # set SINGA_BIN
 if [ -z $SINGA_BIN ]; then
   SINGA_BIN=`dirname "${BASH_SOURCE-$0}"`
@@ -44,6 +48,13 @@ if [ -z $SINGA_CONF ]; then
   SINGA_CONF=$SINGA_HOME/conf
 fi
 
+# set SINGA_LOG
+if [ -z $SINGA_LOG ]; then
+  # add -global arg, so no need to run under SINGA_HOME
+  SINGA_LOG=`"$SINGA_HOME"/singatool getlogdir -global="$SINGA_CONF"/singa.conf`
+  [ $? == 0 ] || exit 1 
+fi
+
 # set ZK_HOME
 if [ -z $ZK_HOME ]; then
   ZK_HOME=$SINGA_HOME/thirdparty/zookeeper-3.4.6
@@ -55,3 +66,5 @@ if [ -z $SINGA_MANAGES_ZK ]; then
   SINGA_MANAGES_ZK=false
 fi
 
+# mark that we have done all
+SINGA_ENV_DONE=1

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index d434331..aa65fd9 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -23,8 +23,7 @@
 # run a Singa job
 #
 
-usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n
-       # workspace should contain job.conf\n
+usage="Usage: singa-run.sh -conf=JOB_CONF [ --resume ]\n
        # set --resume if want to recover a job\n
        ### NOTICE ###\n
        # if you are using model.conf + cluster.conf,\n
@@ -33,28 +32,30 @@ usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n
 
 # check arguments
 while [ $# != 0 ]; do
-  if [[ $1 == "-workspace="* ]]; then
-    workspace=$1
+  if [[ $1 == "-conf="* ]]; then
+    conf=$1
   elif [ $1 == "--resume" ]; then
     resume=1
   else
-    echo -e $usage
-    exit 1
+    echo -e $usage && exit 1
   fi
   shift
 done
-if [ -z $workspace ]; then
+if [ -z $conf ]; then
   echo -e $usage
   exit 1
 fi
 
 # get environment variables
 . `dirname "${BASH_SOURCE-$0}"`/singa-env.sh
-# get workspace path
-workspace=`cd "${workspace:11}">/dev/null; pwd`
-job_conf=$workspace/job.conf
+
+# change conf to an absolute path
+conf_dir=`dirname "${conf:6}"`
+conf_dir=`cd "$conf_dir">/dev/null; pwd`
+conf_base=`basename "${conf:6}"`
+job_conf=$conf_dir/$conf_base
 if [ ! -f $job_conf ]; then
-  echo job.conf not exists in $workspace
+  echo $job_conf not exists
   exit 1
 fi
 cd $SINGA_HOME
@@ -64,20 +65,26 @@ if [ $SINGA_MANAGES_ZK = true ]; then
   $SINGA_BIN/zk-service.sh start || exit 1
 fi
 
+# generate unique job id
+job_id=`./singatool create`
+[ $? == 0 ] || exit 1
+echo Unique JOB_ID is $job_id
+
+# generate job info dir
+# format: job-JOB_ID-YYYYMMDD-HHMMSS
+log_dir=$SINGA_LOG/job-info/job-$job_id-$(date '+%Y%m%d-%H%M%S');
+mkdir -p $log_dir
+echo Record job information to $log_dir
+
 # generate host file
-host_file=$workspace/job.hosts
+host_file=$log_dir/job.hosts
 python $SINGA_HOME/tool/gen_hosts.py -conf=$job_conf \
                                      -hosts=$SINGA_CONF/hostfile \
                                      -output=$host_file \
                                      || exit 1
 
-# generate unique job id
-./singatool create 1>$workspace/job.id || exit 1
-job_id=`cat $workspace/job.id`
-echo Generate job id to $workspace/job.id [job_id = $job_id]
-
 # set command to run singa
-singa_run="./singa -workspace=$workspace -job=$job_id"
+singa_run="./singa -conf=$job_conf -job=$job_id"
 if [ ! -z $resume ]; then
   singa_run="$singa_run --resume"
 fi
@@ -100,6 +107,5 @@ done
 
 # generate pid list for this job
 sleep 2
-./singatool view $job_id 1>$workspace/job.pids || exit
-echo Generate pid list to $workspace/job.pids
+./singatool view $job_id 1>$log_dir/job.pids || exit 1
 wait

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-stop.sh
----------------------------------------------------------------------
diff --git a/bin/singa-stop.sh b/bin/singa-stop.sh
index ff67f32..115f3fb 100755
--- a/bin/singa-stop.sh
+++ b/bin/singa-stop.sh
@@ -33,7 +33,7 @@ ssh_options="-oStrictHostKeyChecking=no \
              -oUserKnownHostsFile=/dev/null \
              -oLogLevel=quiet"
 hosts=`cat $host_file | cut -d ' ' -f 1`
-singa_kill="killall -s SIGKILL -r singa"
+singa_kill="killall -q -s SIGKILL -r singa"
 for i in ${hosts[@]}; do
   echo Kill singa @ $i ...
   if [ $i == localhost ]; then

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index 2541330..f7829b8 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,6 +1,7 @@
 cluster {
   nworker_groups: 1
   nserver_groups: 1
+  workspace: "examples/cifar10"
 }
 
 model {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/include/utils/cluster_rt.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h
index 5738ae3..90f60cd 100644
--- a/include/utils/cluster_rt.h
+++ b/include/utils/cluster_rt.h
@@ -93,7 +93,7 @@ class ZKService {
                    RTCallback *cb);
 
  private:
-  const int kNumRetry = 10;
+  const int kNumRetry = 5;
   const int kSleepSec = 1;
 
   static void WatcherGlobal(zhandle_t* zh, int type, int state,
@@ -139,7 +139,7 @@ class JobManager {
   JobManager(const std::string& host, int timeout);
 
   bool Init();
-  int GenerateJobID();
+  bool GenerateJobID(int* id);
   bool ListJobs(std::vector<JobInfo>* jobs);
   bool ListJobProcs(int job, std::vector<std::string>* procs);
   bool Clean(int job);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
index 87ab384..d95e405 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -13,7 +13,7 @@
 
 DEFINE_int32(job, -1, "Unique job ID generated from singa-run.sh");
 DEFINE_bool(resume, false, "Resume from checkpoint passed at cmd line");
-DEFINE_string(workspace, "./workspace", "workspace passed at cmd line");
+DEFINE_string(conf, "./job.conf", "job conf passed at cmd line");
 
 /**
  * Register layers, and other customizable classes.
@@ -31,12 +31,10 @@ int main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   singa::JobProto jobConf;
-  std::string job_file = FLAGS_workspace + "/job.conf";
+  std::string job_file = FLAGS_conf;
   singa::ReadProtoFromTextFile(job_file.c_str(), &jobConf);
   CHECK(jobConf.has_cluster());
   CHECK(jobConf.has_model());
-  if (!jobConf.cluster().has_workspace())
-    jobConf.mutable_cluster()->set_workspace(FLAGS_workspace);
 
   RegisterClasses();
   singa::SubmitJob(FLAGS_job, FLAGS_resume, jobConf);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 3b22470..eacf7e0 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -19,7 +19,7 @@ message ClusterProto {
   // port number is used by ZeroMQ
   optional int32 start_port = 13 [default = 6723];
   // local workspace, train/val/test shards, checkpoint files
-  optional string workspace = 14 [default = "workspace"];
+  required string workspace = 14;
 
   // conduct updates at server side; otherwise do it at worker side
   optional bool server_update = 40 [default = true];

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/proto/singa.proto
----------------------------------------------------------------------
diff --git a/src/proto/singa.proto b/src/proto/singa.proto
index 94af58d..193c8b7 100644
--- a/src/proto/singa.proto
+++ b/src/proto/singa.proto
@@ -3,6 +3,6 @@ package singa;
 message SingaProto {
   // ip/hostname:port[,ip/hostname:port]
   required string zookeeper_host = 1;
-  // if not set, use the default dir of glog
-  optional string log_dir = 2;
+  // log dir for singa binary and job information(job id, host list, pid list)
+  optional string log_dir = 2 [default = "/tmp/singa-log/"];
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index 0458b12..cd11bbd 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -297,14 +297,15 @@ bool JobManager::Init() {
   return true;
 }
 
-int JobManager::GenerateJobID() {
+bool JobManager::GenerateJobID(int* id) {
   char buf[kZKBufSize];
   string lock = kZKPathJLock + "/lock-";
   if (!zk_.CreateNode(lock.c_str(), nullptr,
                         ZOO_EPHEMERAL | ZOO_SEQUENCE, buf)) {
-    return -1;
+    return false;
   }
-  return atoi(buf+strlen(buf)-10);
+  *id = atoi(buf+strlen(buf)-10);
+  return true;
 }
 
 bool JobManager::ListJobProcs(int job, vector<string>* procs) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/utils/tool.cc
----------------------------------------------------------------------
diff --git a/src/utils/tool.cc b/src/utils/tool.cc
index 60da414..c450b72 100644
--- a/src/utils/tool.cc
+++ b/src/utils/tool.cc
@@ -11,74 +11,109 @@ namespace gflags = google;
 
 DEFINE_string(global, "conf/singa.conf", "Global config file");
 
+singa::SingaProto global;
+const int SUCCESS = 0;
+const int ARG_ERR = 1;
+const int RUN_ERR = 2;
+
+// generate a unique job id
+int create() {
+  singa::JobManager mngr(global.zookeeper_host());
+  if (!mngr.Init()) return RUN_ERR;
+  int id;
+  if (!mngr.GenerateJobID(&id)) return RUN_ERR;
+  printf("%d\n", id);
+  return SUCCESS;
+}
+
+// list singa jobs (running or all)
+int list(bool all) {
+  singa::JobManager mngr(global.zookeeper_host());
+  if (!mngr.Init()) return RUN_ERR;
+  std::vector<singa::JobInfo> jobs;
+  if (!mngr.ListJobs(&jobs)) return RUN_ERR;
+  printf("JOB ID    |NUM PROCS  \n");
+  printf("----------|-----------\n");
+  for (singa::JobInfo job : jobs) {
+    if (!job.procs && !all) continue;
+    printf("%-10d|%-10d\n", job.id, job.procs);
+  }
+  return SUCCESS;
+}
+
+// view procs of a singa job
+int view(int id) {
+  singa::JobManager mngr(global.zookeeper_host());
+  if (!mngr.Init()) return RUN_ERR;
+  std::vector<std::string> procs;
+  if (!mngr.ListJobProcs(id, &procs)) return RUN_ERR;
+  for (std::string s : procs) {
+    printf("%s\n", s.c_str());
+  }
+  return SUCCESS;
+}
+
+// clean a job path in zookeeper
+int clean(int id) {
+  singa::JobManager mngr(global.zookeeper_host());
+  if (!mngr.Init()) return RUN_ERR;
+  if (!mngr.Clean(id)) return RUN_ERR;
+  return SUCCESS;
+}
+
+// clean all singa data in zookeeper
+int cleanup() {
+  singa::JobManager mngr(global.zookeeper_host());
+  if (!mngr.Init()) return RUN_ERR;
+  if (!mngr.Cleanup()) return RUN_ERR;
+  return SUCCESS;
+}
+
+// show log dir in global config
+int getlogdir() {
+  std::string dir = global.log_dir();
+  while (dir.length() > 1 && dir[dir.length()-1] == '/') dir.pop_back();
+  printf("%s\n", dir.c_str());
+  return SUCCESS;
+}
+
 int main(int argc, char **argv) {
-  google::InitGoogleLogging(argv[0]);
+  std::string usage = "usage: singatool <command> <args>\n"
+      " getlogdir    :  show log dir in global config\n"
+      " create       :  generate a unique job id\n"
+      " list         :  list running singa jobs\n"
+      " listall      :  list all singa jobs\n"
+      " view JOB_ID  :  view procs of a singa job\n"
+      " clean JOB_ID :  clean a job path in zookeeper\n"
+      " cleanup      :  clean all singa data in zookeeper\n";
   // set logging level to ERROR and log to STDERR
   FLAGS_logtostderr = 1;
   FLAGS_minloglevel = 2;
+  google::InitGoogleLogging(argv[0]);
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  singa::SingaProto global;
   singa::ReadProtoFromTextFile(FLAGS_global.c_str(), &global);
-  LOG(INFO) << "The global config is \n" << global.DebugString();
 
-  singa::JobManager mng(global.zookeeper_host());
-  std::string usage = "singatool usage:\n"
-      "# ./singatool create       :  generate a unique job id\n"
-      "# ./singatool list         :  list running singa jobs\n"
-      "# ./singatool view JOB_ID  :  view procs of a singa job\n"
-      "# ./singatool clean JOB_ID :  clean a job path in zookeeper\n"
-      "# ./singatool cleanup      :  clean all singa data in zookeeper\n"
-      "# ./singatool listall      :  list all singa jobs\n";
-  if (argc <= 1) {
-    LOG(ERROR) << usage;
-    return 1;
+  // stat code: ARG_ERR for wrong argument, RUN_ERR for runtime error
+  int stat = SUCCESS;
+  if (argc <= 1) stat = ARG_ERR;
+  else {
+    if (!strcmp(argv[1], "create"))
+      stat = create();
+    else if (!strcmp(argv[1], "list"))
+      stat = list(false);
+    else if (!strcmp(argv[1], "listall"))
+      stat = list(true);
+    else if (!strcmp(argv[1], "view"))
+      stat = (argc > 2) ? view(atoi(argv[2])) : ARG_ERR;
+    else if (!strcmp(argv[1], "clean"))
+      stat = (argc > 2) ? clean(atoi(argv[2])) : ARG_ERR;
+    else if (!strcmp(argv[1], "cleanup"))
+      stat = cleanup();
+    else if (!strcmp(argv[1], "getlogdir"))
+      stat = getlogdir();
+    else stat = ARG_ERR;
   }
-  if (!mng.Init()) return 1;
-  if (!strcmp(argv[1], "create")) {
-    int id = mng.GenerateJobID();
-    printf("%d\n", id);
-  } else if (!strcmp(argv[1], "list")) {
-    std::vector<singa::JobInfo> jobs;
-    if (!mng.ListJobs(&jobs)) return 1;
-    printf("JOB ID    |NUM PROCS  \n");
-    printf("----------|-----------\n");
-    for (singa::JobInfo job : jobs) {
-      if (!job.procs) continue;
-      printf("%-10d|%-10d\n", job.id, job.procs);
-    }
-  } else if (!strcmp(argv[1], "listall")) {
-    std::vector<singa::JobInfo> jobs;
-    if (!mng.ListJobs(&jobs)) return 1;
-    printf("JOB ID    |NUM PROCS  \n");
-    printf("----------|-----------\n");
-    for (singa::JobInfo job : jobs) {
-      printf("%-10d|%-10d\n", job.id, job.procs);
-    }
-  } else if (!strcmp(argv[1], "view")) {
-    if (argc <= 2) {
-      LOG(ERROR) << usage;
-      return 1;
-    }
-    int id = atoi(argv[2]);
-    std::vector<std::string> procs;
-    if (!mng.ListJobProcs(id, &procs)) return 1;
-    for (std::string s : procs) {
-      printf("%s\n", s.c_str());
-    }
-  } else if (!strcmp(argv[1], "clean")) {
-    if (argc <= 2) {
-      LOG(ERROR) << usage;
-      return 1;
-    }
-    int id = atoi(argv[2]);
-    if (!mng.Clean(id)) return 1;
-  } else if (!strcmp(argv[1], "cleanup")) {
-    if (!mng.Cleanup()) return 1;
-  } else {
-    LOG(ERROR) << usage;
-    return 1;
-  }
-
-  return 0;
+  
+  if (stat == ARG_ERR) LOG(ERROR) << usage;
+  return stat;
 }


Mime
View raw message