singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wang...@apache.org
Subject [12/22] incubator-singa git commit: add retry for worker connecting to zk, when its subscribed server group is not up yet
Date Wed, 27 May 2015 14:39:13 GMT
add retry for worker connecting to zk, when its subscribed server group is not up yet


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96121bae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96121bae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96121bae

Branch: refs/heads/master
Commit: 96121bae15b4fe2dbd8541f5daf47ab91d080bb7
Parents: cd9fc79
Author: wangsheng <wangsheng1001@gmail.com>
Authored: Tue May 26 16:52:39 2015 +0800
Committer: wangsheng <wangsheng1001@gmail.com>
Committed: Tue May 26 16:52:39 2015 +0800

----------------------------------------------------------------------
 Makefile.example           |  2 +-
 include/utils/cluster.h    | 13 -------------
 include/utils/cluster_rt.h |  2 ++
 src/utils/cluster_rt.cc    | 33 +++++++++++++++++++++------------
 4 files changed, 24 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/Makefile.example
----------------------------------------------------------------------
diff --git a/Makefile.example b/Makefile.example
index 582e8d7..6d8d83a 100644
--- a/Makefile.example
+++ b/Makefile.example
@@ -51,7 +51,7 @@ OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
 .PHONY: singa test
 
 singa: $(PROTO_OBJS) $(SINGA_OBJS)
-	$(CXX) $(SINGA_OBJS) src/main.cc -o $(BUILD_DIR)/singa $(CXXFLAGS) $(LDFLAGS)
+	$(CXX) $(SINGA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
 	@echo
 
 loader: proto $(LOADER_OBJS)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster.h b/include/utils/cluster.h
index 563045d..d7ac365 100644
--- a/include/utils/cluster.h
+++ b/include/utils/cluster.h
@@ -109,19 +109,6 @@ class Cluster {
   }
    */
 
-  //ClusterRuntime functions
-  bool server_watch(int gid, int sid) const {
-    return false;
-  }
-
-  bool worker_join_sgroup(int gid, int wid, int server_group) const {
-    return false;
-  }
-
-  bool worker_leave_sgroup(int gid, int wid, int s_group) const {
-    return false;
-  }
-
   shared_ptr<ClusterRuntime> runtime() const {
     return cluster_rt_;
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster_rt.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h
index 1430119..54a13c5 100644
--- a/include/utils/cluster_rt.h
+++ b/include/utils/cluster_rt.h
@@ -74,6 +74,8 @@ class ZKClusterRT : public ClusterRuntime{
   vector<RTCallback *> cb_vec_;
     
   const int MAX_BUF_LEN = 50;
+  const int RETRY_NUM = 10;
+  const int SLEEP_SEC = 1;
 };
 
 } // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index 6a12ca9..b97fadc 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -81,21 +81,30 @@ bool ZKClusterRT::wJoinSGroup(int gid, int wid, int s_group){
   string path = getSGroupPath(s_group) + getWorkerPath(gid, wid);
   char buf[MAX_BUF_LEN];
 
-  int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN);
-  if (ret == ZOK){
-    LOG(INFO) << "zookeeper node " << buf << " created";
-    return true;
-  }
-  else if (ret == ZNODEEXISTS){
-    LOG(WARNING) << "zookeeper node " << path << " already exist";
-    return true;
-  }
-  else if (ret == ZNONODE){
-    LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist";
+  //try to create a file under the server group path
+  for (int i = 0; i < RETRY_NUM; ++i){
+    //send the zk request
+    int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN);
+
+    if (ret == ZOK){
+      LOG(INFO) << "zookeeper node " << buf << " created";
+      return true;
+    }
+    else if (ret == ZNODEEXISTS){
+      LOG(WARNING) << "zookeeper node " << path << " already exist";
+      return true;
+    }
+    //the parent node is not on, need to wait
+    else if (ret == ZNONODE){
+      LOG(WARNING) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist, retry later";
+      sleep(SLEEP_SEC);
+    }
+    
+    LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)";
     return false;
   }
 
-  LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)";
+  LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << "still not exist after " << RETRY_NUM << " tries";
   return false;
 }
 


Mime
View raw message