Return-Path: X-Original-To: apmail-singa-commits-archive@minotaur.apache.org Delivered-To: apmail-singa-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2470D18BF9 for ; Wed, 27 May 2015 14:39:37 +0000 (UTC) Received: (qmail 93876 invoked by uid 500); 27 May 2015 14:39:21 -0000 Delivered-To: apmail-singa-commits-archive@singa.apache.org Received: (qmail 93859 invoked by uid 500); 27 May 2015 14:39:21 -0000 Mailing-List: contact commits-help@singa.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@singa.incubator.apache.org Delivered-To: mailing list commits@singa.incubator.apache.org Received: (qmail 93850 invoked by uid 99); 27 May 2015 14:39:21 -0000 Received: from Unknown (HELO spamd2-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 27 May 2015 14:39:21 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd2-us-west.apache.org (ASF Mail Server at spamd2-us-west.apache.org) with ESMTP id CA00C1A356B for ; Wed, 27 May 2015 14:39:20 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd2-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 0.791 X-Spam-Level: X-Spam-Status: No, score=0.791 tagged_above=-999 required=6.31 tests=[KAM_ASCII_DIVIDERS=0.8, T_RP_MATCHES_RCVD=-0.01, URIBL_BLOCKED=0.001] autolearn=disabled Received: from mx1-eu-west.apache.org ([10.40.0.8]) by localhost (spamd2-us-west.apache.org [10.40.0.9]) (amavisd-new, port 10024) with ESMTP id WrUfyklpKom6 for ; Wed, 27 May 2015 14:39:05 +0000 (UTC) Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx1-eu-west.apache.org (ASF Mail Server at mx1-eu-west.apache.org) with SMTP id 831BA24C28 for ; Wed, 27 May 2015 14:39:03 +0000 (UTC) Received: (qmail 93356 invoked by uid 99); 27 May 2015 14:39:03 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 27 May 2015 14:39:03 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 470BDE00DC; Wed, 27 May 2015 14:39:03 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wangwei@apache.org To: commits@singa.incubator.apache.org Date: Wed, 27 May 2015 14:39:13 -0000 Message-Id: <2fd5da65bb894afc9dbf7a6b975bdad6@git.apache.org> In-Reply-To: <9b081f203faf4ff7953998ab95776130@git.apache.org> References: <9b081f203faf4ff7953998ab95776130@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [12/22] incubator-singa git commit: add retry for worker connecting to zk, when its subscribed server group is not up yet add retry for worker connecting to zk, when its subscribed server group is not up yet Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96121bae Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96121bae Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96121bae Branch: refs/heads/master Commit: 96121bae15b4fe2dbd8541f5daf47ab91d080bb7 Parents: cd9fc79 Author: wangsheng Authored: Tue May 26 16:52:39 2015 +0800 Committer: wangsheng Committed: Tue May 26 16:52:39 2015 +0800 ---------------------------------------------------------------------- Makefile.example | 2 +- include/utils/cluster.h | 13 ------------- include/utils/cluster_rt.h | 2 ++ src/utils/cluster_rt.cc | 33 +++++++++++++++++++++------------ 4 files changed, 24 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/Makefile.example ---------------------------------------------------------------------- diff --git a/Makefile.example b/Makefile.example index 582e8d7..6d8d83a 100644 --- a/Makefile.example +++ b/Makefile.example @@ -51,7 +51,7 @@ OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) ) .PHONY: singa test singa: $(PROTO_OBJS) $(SINGA_OBJS) - $(CXX) $(SINGA_OBJS) src/main.cc -o $(BUILD_DIR)/singa $(CXXFLAGS) $(LDFLAGS) + $(CXX) $(SINGA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS) @echo loader: proto $(LOADER_OBJS) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster.h b/include/utils/cluster.h index 563045d..d7ac365 100644 --- a/include/utils/cluster.h +++ b/include/utils/cluster.h @@ -109,19 +109,6 @@ class Cluster { } */ - //ClusterRuntime functions - bool server_watch(int gid, int sid) const { - return false; - } - - bool worker_join_sgroup(int gid, int wid, int server_group) const { - return false; - } - - bool worker_leave_sgroup(int gid, int wid, int s_group) const { - return false; - } - shared_ptr runtime() const { return cluster_rt_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster_rt.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h index 1430119..54a13c5 100644 --- a/include/utils/cluster_rt.h +++ b/include/utils/cluster_rt.h @@ -74,6 +74,8 @@ class ZKClusterRT : public ClusterRuntime{ vector cb_vec_; const int MAX_BUF_LEN = 50; + const int RETRY_NUM = 10; + const int SLEEP_SEC = 1; }; } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index 6a12ca9..b97fadc 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -81,21 +81,30 @@ bool ZKClusterRT::wJoinSGroup(int gid, int wid, int s_group){ string path = getSGroupPath(s_group) + getWorkerPath(gid, wid); char buf[MAX_BUF_LEN]; - int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN); - if (ret == ZOK){ - LOG(INFO) << "zookeeper node " << buf << " created"; - return true; - } - else if (ret == ZNODEEXISTS){ - LOG(WARNING) << "zookeeper node " << path << " already exist"; - return true; - } - else if (ret == ZNONODE){ - LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist"; + //try to create a file under the server group path + for (int i = 0; i < RETRY_NUM; ++i){ + //send the zk request + int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN); + + if (ret == ZOK){ + LOG(INFO) << "zookeeper node " << buf << " created"; + return true; + } + else if (ret == ZNODEEXISTS){ + LOG(WARNING) << "zookeeper node " << path << " already exist"; + return true; + } + //the parent node is not on, need to wait + else if (ret == ZNONODE){ + LOG(WARNING) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist, retry later"; + sleep(SLEEP_SEC); + } + + LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)"; return false; } - LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)"; + LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << "still not exist after " << RETRY_NUM << " tries"; return false; }