Return-Path: X-Original-To: apmail-giraph-commits-archive@www.apache.org Delivered-To: apmail-giraph-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8FF2D18D19 for ; Thu, 17 Mar 2016 16:58:12 +0000 (UTC) Received: (qmail 42599 invoked by uid 500); 17 Mar 2016 16:58:12 -0000 Delivered-To: apmail-giraph-commits-archive@giraph.apache.org Received: (qmail 42568 invoked by uid 500); 17 Mar 2016 16:58:12 -0000 Mailing-List: contact commits-help@giraph.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@giraph.apache.org Delivered-To: mailing list commits@giraph.apache.org Received: (qmail 42558 invoked by uid 99); 17 Mar 2016 16:58:12 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 17 Mar 2016 16:58:12 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 415EFDFA0A; Thu, 17 Mar 2016 16:58:12 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: maja@apache.org To: commits@giraph.apache.org Message-Id: <9a7c5dcf6b7f4d75ae999fc77c0be5fa@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: git commit: updated refs/heads/trunk to e4aa99d Date: Thu, 17 Mar 2016 16:58:12 +0000 (UTC) Repository: giraph Updated Branches: refs/heads/trunk 5a04dc554 -> e4aa99d3f Increase info-logging while waiting for straggler workers Summary: Keep logging info messages while waiting for task-time-out Test Plan: All unit tests are passing. Manual tests to ensure desired functionality is observed. Reviewers: maja.kabiljo Subscribers: dionysis.logothetis Differential Revision: https://reviews.facebook.net/D55467 Project: http://git-wip-us.apache.org/repos/asf/giraph/repo Commit: http://git-wip-us.apache.org/repos/asf/giraph/commit/e4aa99d3 Tree: http://git-wip-us.apache.org/repos/asf/giraph/tree/e4aa99d3 Diff: http://git-wip-us.apache.org/repos/asf/giraph/diff/e4aa99d3 Branch: refs/heads/trunk Commit: e4aa99d3f603e70c7db3a55ae5d59470c1a37f58 Parents: 5a04dc5 Author: Tyler Serdar Bulut Authored: Tue Mar 15 12:10:36 2016 -0700 Committer: Maja Kabiljo Committed: Thu Mar 17 09:56:00 2016 -0700 ---------------------------------------------------------------------- .../apache/giraph/master/BspServiceMaster.java | 106 ++++++++++++------- 1 file changed, 65 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/giraph/blob/e4aa99d3/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java ---------------------------------------------------------------------- diff --git a/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java b/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java index cc70b17..e9ece66 100644 --- a/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java +++ b/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java @@ -1275,41 +1275,47 @@ public class BspServiceMaster finishedHostnameIdList; + List finishedHostnameIdList = new ArrayList<>(); long nextInfoMillis = System.currentTimeMillis(); final int defaultTaskTimeoutMsec = 10 * 60 * 1000; // from TaskTracker + final int waitBetweenLogInfoMsec = 30 * 1000; final int taskTimeoutMsec = getContext().getConfiguration().getInt( - "mapred.task.timeout", defaultTaskTimeoutMsec); + "mapred.task.timeout", defaultTaskTimeoutMsec) / 2; + long lastRegularRunTimeMsec = 0; + int eventLoopTimeout = Math.min(taskTimeoutMsec, waitBetweenLogInfoMsec); + boolean logInfoOnlyRun = false; List deadWorkers = new ArrayList<>(); while (true) { - try { - finishedHostnameIdList = - getZkExt().getChildrenExt(finishedWorkerPath, - true, - false, - false); - } catch (KeeperException e) { - throw new IllegalStateException( - "barrierOnWorkerList: KeeperException - Couldn't get " + - "children of " + finishedWorkerPath, e); - } catch (InterruptedException e) { - throw new IllegalStateException( - "barrierOnWorkerList: IllegalException - Couldn't get " + - "children of " + finishedWorkerPath, e); - } - if (LOG.isDebugEnabled()) { - LOG.debug("barrierOnWorkerList: Got finished worker list = " + - finishedHostnameIdList + ", size = " + - finishedHostnameIdList.size() + - ", worker list = " + - workerInfoList + ", size = " + - workerInfoList.size() + - " from " + finishedWorkerPath); + if (! logInfoOnlyRun) { + try { + finishedHostnameIdList = + getZkExt().getChildrenExt(finishedWorkerPath, + true, + false, + false); + } catch (KeeperException e) { + throw new IllegalStateException( + "barrierOnWorkerList: KeeperException - Couldn't get " + + "children of " + finishedWorkerPath, e); + } catch (InterruptedException e) { + throw new IllegalStateException( + "barrierOnWorkerList: IllegalException - Couldn't get " + + "children of " + finishedWorkerPath, e); + } + if (LOG.isDebugEnabled()) { + LOG.debug("barrierOnWorkerList: Got finished worker list = " + + finishedHostnameIdList + ", size = " + + finishedHostnameIdList.size() + + ", worker list = " + + workerInfoList + ", size = " + + workerInfoList.size() + + " from " + finishedWorkerPath); + } } if (LOG.isInfoEnabled() && (System.currentTimeMillis() > nextInfoMillis)) { - nextInfoMillis = System.currentTimeMillis() + 30000; + nextInfoMillis = System.currentTimeMillis() + waitBetweenLogInfoMsec; LOG.info("barrierOnWorkerList: " + finishedHostnameIdList.size() + " out of " + workerInfoList.size() + @@ -1322,29 +1328,47 @@ public class BspServiceMaster= taskTimeoutMsec) { + logInfoOnlyRun = false; + } else { + logInfoOnlyRun = true; + continue; + } + // Did a worker die? try { deadWorkers.addAll(superstepChosenWorkerAlive(