Return-Path: X-Original-To: apmail-tez-commits-archive@minotaur.apache.org Delivered-To: apmail-tez-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id DCBCCC5F7 for ; Wed, 12 Nov 2014 19:08:27 +0000 (UTC) Received: (qmail 20748 invoked by uid 500); 12 Nov 2014 19:08:27 -0000 Delivered-To: apmail-tez-commits-archive@tez.apache.org Received: (qmail 20710 invoked by uid 500); 12 Nov 2014 19:08:27 -0000 Mailing-List: contact commits-help@tez.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@tez.apache.org Delivered-To: mailing list commits@tez.apache.org Received: (qmail 20701 invoked by uid 99); 12 Nov 2014 19:08:27 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 12 Nov 2014 19:08:27 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 73C3DA10CD4; Wed, 12 Nov 2014 19:08:27 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: sseth@apache.org To: commits@tez.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: tez git commit: TEZ-1770. Handle ConnectExceptions correctly when establishing connections to an NM which may be down. (sseth) Date: Wed, 12 Nov 2014 19:08:27 +0000 (UTC) Repository: tez Updated Branches: refs/heads/master 0ebfc1b10 -> 0cceb1f22 TEZ-1770. Handle ConnectExceptions correctly when establishing connections to an NM which may be down. (sseth) Project: http://git-wip-us.apache.org/repos/asf/tez/repo Commit: http://git-wip-us.apache.org/repos/asf/tez/commit/0cceb1f2 Tree: http://git-wip-us.apache.org/repos/asf/tez/tree/0cceb1f2 Diff: http://git-wip-us.apache.org/repos/asf/tez/diff/0cceb1f2 Branch: refs/heads/master Commit: 0cceb1f220632e7722930315c03ca8c44c381e68 Parents: 0ebfc1b Author: Siddharth Seth Authored: Wed Nov 12 11:08:12 2014 -0800 Committer: Siddharth Seth Committed: Wed Nov 12 11:08:12 2014 -0800 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../library/common/shuffle/HttpConnection.java | 23 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tez/blob/0cceb1f2/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 7ad6903..80263bf 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -23,6 +23,7 @@ ALL CHANGES: TEZ-1761. TestRecoveryParser::testGetLastInProgressDAG fails in similar manner to TEZ-1686. TEZ-1687. Use logIdentifier of Vertex for logging. TEZ-1737. Should add taskNum in VertexFinishedEvent. + TEZ-1770. Handle ConnectExceptions correctly when establishing connections to an NM which may be down. Release 0.5.2: 2014-11-07 http://git-wip-us.apache.org/repos/asf/tez/blob/0cceb1f2/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/HttpConnection.java ---------------------------------------------------------------------- diff --git a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/HttpConnection.java b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/HttpConnection.java index 6e33993..4732a5a 100644 --- a/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/HttpConnection.java +++ b/tez-runtime-library/src/main/java/org/apache/tez/runtime/library/common/shuffle/HttpConnection.java @@ -156,12 +156,14 @@ public class HttpConnection { connection.setConnectTimeout(unit); int connectionFailures = 0; while (true) { + long connectStartTime = System.currentTimeMillis(); try { connection.connect(); connectionSucceeed = true; break; } catch (IOException ioe) { // Don't attempt another connect if already cleanedup. + connectionFailures++; if (cleanup) { LOG.info("Cleanup is set to true. Not attempting to" + " connect again. Last exception was: [" @@ -173,15 +175,32 @@ public class HttpConnection { // throw an exception if we have waited for timeout amount of time // note that the updated value if timeout is used here if (connectionTimeout <= 0) { - throw ioe; + throw new IOException( + "Failed to connect to " + url + ", #connectionFailures=" + connectionFailures, ioe); + } + long elapsed = System.currentTimeMillis() - connectStartTime; + if (elapsed < unit) { + try { + long sleepTime = unit - elapsed; + if (LOG.isDebugEnabled()) { + LOG.debug("Sleeping for " + sleepTime + " while establishing connection to " + url + + ", since connectAttempt returned in " + elapsed + " ms"); + } + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + throw new IOException( + "Connection establishment sleep interrupted, #connectionFailures=" + + connectionFailures, e); + } } + // reset the connect timeout for the last try if (connectionTimeout < unit) { unit = connectionTimeout; // reset the connect time out for the final connect connection.setConnectTimeout(unit); } - connectionFailures++; + } } if (LOG.isDebugEnabled()) {