From: zhz@apache.org
To: common-commits@hadoop.apache.org
Reply-To: common-dev@hadoop.apache.org
Date: Wed, 30 Sep 2015 15:41:22 -0000
Subject: [22/58] [abbrv] hadoop git commit: HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.

HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4c9497cb
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4c9497cb
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4c9497cb

Branch: refs/heads/HDFS-7285
Commit: 4c9497cbf02ecc82532a4e79e18912d8e0eb4731
Parents: fb2e525
Author: Kihwal Lee
Authored: Mon Sep 28 13:29:19 2015 -0500
Committer: Kihwal Lee
Committed: Mon Sep 28 13:29:56 2015 -0500

----------------------------------------------------------------------
 .../org/apache/hadoop/hdfs/DataStreamer.java | 60 ++++++++++++++------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt  |  3 +
 2 files changed, 47 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4c9497cb/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
index 6482966..d1d8d37 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
@@ -1208,22 +1208,46 @@ class DataStreamer extends Daemon {
       return;
     }
 
-    //get a new datanode
+    int tried = 0;
     final DatanodeInfo[] original = nodes;
-    final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(
-        src, stat.getFileId(), block, nodes, storageIDs,
-        failed.toArray(new DatanodeInfo[failed.size()]),
-        1, dfsClient.clientName);
-    setPipeline(lb);
-
-    //find the new datanode
-    final int d = findNewDatanode(original);
-
-    //transfer replica
-    final DatanodeInfo src = d == 0? nodes[1]: nodes[d - 1];
-    final DatanodeInfo[] targets = {nodes[d]};
-    final StorageType[] targetStorageTypes = {storageTypes[d]};
-    transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+    final StorageType[] originalTypes = storageTypes;
+    final String[] originalIDs = storageIDs;
+    IOException caughtException = null;
+    ArrayList<DatanodeInfo> exclude = new ArrayList<>(failed);
+    while (tried < 3) {
+      LocatedBlock lb;
+      //get a new datanode
+      lb = dfsClient.namenode.getAdditionalDatanode(
+          src, stat.getFileId(), block, nodes, storageIDs,
+          exclude.toArray(new DatanodeInfo[exclude.size()]),
+          1, dfsClient.clientName);
+      // a new node was allocated by the namenode. Update nodes.
+      setPipeline(lb);
+
+      //find the new datanode
+      final int d = findNewDatanode(original);
+      //transfer replica. pick a source from the original nodes
+      final DatanodeInfo src = original[tried % original.length];
+      final DatanodeInfo[] targets = {nodes[d]};
+      final StorageType[] targetStorageTypes = {storageTypes[d]};
+
+      try {
+        transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+      } catch (IOException ioe) {
+        DFSClient.LOG.warn("Error transferring data from " + src + " to "
+            + nodes[d] + ": " + ioe.getMessage());
+        caughtException = ioe;
+        // add the allocated node to the exclude list.
+        exclude.add(nodes[d]);
+        setPipeline(original, originalTypes, originalIDs);
+        tried++;
+        continue;
+      }
+      return; // finished successfully
+    }
+    // All retries failed
+    throw (caughtException != null) ? caughtException :
+        new IOException("Failed to add a node");
   }
 
   private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
@@ -1236,7 +1260,11 @@ class DataStreamer extends Daemon {
     try {
       sock = createSocketForPipeline(src, 2, dfsClient);
       final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);
-      final long readTimeout = dfsClient.getDatanodeReadTimeout(2);
+
+      // transfer timeout multiplier based on the transfer size
+      // One per 200 packets = 12.8MB. Minimum is 2.
+      int multi = 2 + (int)(bytesSent/dfsClient.getConf().getWritePacketSize())/200;
+      final long readTimeout = dfsClient.getDatanodeReadTimeout(multi);
 
       OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
       InputStream unbufIn = NetUtils.getInputStream(sock, readTimeout);


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4c9497cb/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 3571e4a..1d9fa1d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -1488,6 +1488,9 @@ Release 2.7.2 - UNRELEASED
     HDFS-9043. Doc updation for commands in HDFS Federation
     (J.Andreina via vinayakumab)
 
+    HDFS-9106. Transfer failure during pipeline recovery causes permanent
+    write failures (kihwal)
+
 Release 2.7.1 - 2015-07-06
 
   INCOMPATIBLE CHANGES
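----------------------------------------------------------------------
For readers following the first hunk: the patch replaces a single
getAdditionalDatanode()/transfer() attempt with a bounded retry loop.
Each transfer target that fails is added to an exclusion list before the
namenode is asked for another replacement, and the pipeline is rolled
back to its original nodes between attempts. The sketch below isolates
just that retry-with-exclusion pattern; Node, Allocator, Mover and
addNodeWithRetry are hypothetical stand-ins for illustration, not HDFS
APIs.

  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.List;

  class RetryWithExclusionSketch {
    static final int MAX_TRIES = 3; // mirrors "while (tried < 3)" above

    interface Node {}
    interface Allocator { Node allocate(List<Node> exclude) throws IOException; }
    interface Mover { void transferTo(Node target) throws IOException; }

    static Node addNodeWithRetry(Allocator allocator, Mover mover,
        List<Node> alreadyFailed) throws IOException {
      List<Node> exclude = new ArrayList<>(alreadyFailed);
      IOException caught = null;
      for (int tried = 0; tried < MAX_TRIES; tried++) {
        // ask for a replacement node, never one that already failed
        Node candidate = allocator.allocate(exclude);
        try {
          mover.transferTo(candidate); // copy an existing replica to it
          return candidate;            // success: keep the new node
        } catch (IOException ioe) {
          caught = ioe;
          exclude.add(candidate);      // skip this node on the next attempt
          // the real patch also restores the original pipeline here
        }
      }
      throw (caught != null) ? caught : new IOException("Failed to add a node");
    }
  }

Before the patch, one failed transfer surfaced directly to the writer;
with the loop, up to three replacement nodes are tried before the
original exception is rethrown.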
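----------------------------------------------------------------------
The second hunk scales the transfer's read timeout with the amount of
data already written, instead of using the fixed two-node timeout. A
worked example of the multiplier arithmetic, assuming the default 64 KB
write packet size (the actual value comes from the client's
getWritePacketSize()):

  long bytesSent = 512L * 1024 * 1024;  // example: 512 MB written so far
  long packetSize = 64L * 1024;         // assumed 64 KB default packet size
  int multi = 2 + (int) (bytesSent / packetSize) / 200;
  // bytesSent/packetSize = 8192 packets; 8192/200 = 40; multi = 2 + 40 = 42

200 packets at 64 KB each is 12.8 MB in the comment's round figures,
which is where "One per 200 packets = 12.8MB" comes from; a short
transfer bottoms out at the minimum multiplier of 2.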