Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 07D7F200C88 for ; Fri, 19 May 2017 00:56:47 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 06443160BC4; Thu, 18 May 2017 22:56:47 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id F2EC7160BB5 for ; Fri, 19 May 2017 00:56:45 +0200 (CEST) Received: (qmail 92827 invoked by uid 500); 18 May 2017 22:56:45 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 92817 invoked by uid 99); 18 May 2017 22:56:45 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 18 May 2017 22:56:45 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id F1373DFF93; Thu, 18 May 2017 22:56:44 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: yjzhangal@apache.org To: common-commits@hadoop.apache.org Message-Id: <6e88332e4a4949208bb55f6b2ec2b94f@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: HADOOP-14407. DistCp - Introduce a configurable copy buffer size. (Omkar Aradhya K S via Yongjun Zhang) Date: Thu, 18 May 2017 22:56:44 +0000 (UTC) archived-at: Thu, 18 May 2017 22:56:47 -0000 Repository: hadoop Updated Branches: refs/heads/trunk 40e6a85d2 -> b4adc8392 HADOOP-14407. DistCp - Introduce a configurable copy buffer size. 
(Omkar Aradhya K S via Yongjun Zhang) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/b4adc839 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/b4adc839 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/b4adc839 Branch: refs/heads/trunk Commit: b4adc8392c1314d6d6fbdd00f2afb306ef20a650 Parents: 40e6a85 Author: Yongjun Zhang Authored: Thu May 18 15:35:22 2017 -0700 Committer: Yongjun Zhang Committed: Thu May 18 15:35:22 2017 -0700 ---------------------------------------------------------------------- .../apache/hadoop/tools/DistCpConstants.java | 6 +++++ .../org/apache/hadoop/tools/DistCpContext.java | 4 ++++ .../apache/hadoop/tools/DistCpOptionSwitch.java | 8 +++++++ .../org/apache/hadoop/tools/DistCpOptions.java | 23 +++++++++++++++++++- .../org/apache/hadoop/tools/OptionsParser.java | 12 ++++++++++ .../tools/mapred/RetriableFileCopyCommand.java | 11 ++++++---- .../src/site/markdown/DistCp.md.vm | 1 + .../apache/hadoop/tools/TestDistCpOptions.java | 19 +++++++++++++++- 8 files changed, 78 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java index ff16e44..7e8e67d 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java @@ -109,6 +109,10 @@ public class DistCpConstants { /* DistCp CopyListing class override param */ public static final String CONF_LABEL_COPY_LISTING_CLASS = "distcp.copy.listing.class"; + 
/* DistCp Copy Buffer Size */ + public static final String CONF_LABEL_COPY_BUFFER_SIZE = + "distcp.copy.buffer.size"; + /** * Constants for DistCp return code to shell / consumer of ToolRunner's run */ @@ -141,4 +145,6 @@ public class DistCpConstants { public static final String HDFS_RESERVED_RAW_DIRECTORY_NAME = "/.reserved/raw"; static final String HDFS_DISTCP_DIFF_DIRECTORY_NAME = ".distcp.diff.tmp"; + + public static final int COPY_BUFFER_SIZE_DEFAULT = 8 * 1024; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java index c34005e..fc047ca 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java @@ -175,6 +175,10 @@ public class DistCpContext { return options.getBlocksPerChunk() > 0; } + public int getCopyBufferSize() { + return options.getCopyBufferSize(); + } + public void setTargetPathExists(boolean targetPathExists) { this.targetPathExists = targetPathExists; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java index 81abb7d..016172e0 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java @@ 
-180,6 +180,14 @@ public enum DistCpOptionSwitch { + "system implements concat method")), /** + * Configurable copy buffer size. + */ + COPY_BUFFER_SIZE(DistCpConstants.CONF_LABEL_COPY_BUFFER_SIZE, + new Option("copybuffersize", true, "Size of the copy buffer to use. " + + "By default is " + + DistCpConstants.COPY_BUFFER_SIZE_DEFAULT + "B.")), + + /** * Specify bandwidth per map in MB, accepts bandwidth as a fraction */ BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB, http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java index 97ae0c4..af6cb8b 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java @@ -143,6 +143,8 @@ public final class DistCpOptions { // to copy in parallel. Default is 0 and file are not splitted. private final int blocksPerChunk; + private final int copyBufferSize; + /** * File attributes for preserve. * @@ -200,6 +202,8 @@ public final class DistCpOptions { this.preserveStatus = builder.preserveStatus; this.blocksPerChunk = builder.blocksPerChunk; + + this.copyBufferSize = builder.copyBufferSize; } public Path getSourceFileListing() { @@ -302,7 +306,7 @@ public final class DistCpOptions { } /** - * Checks if the input attribute should be preserved or not + * Checks if the input attribute should be preserved or not. 
* * @param attribute - Attribute to check * @return True if attribute should be preserved, false otherwise @@ -315,6 +319,10 @@ public final class DistCpOptions { return blocksPerChunk; } + public int getCopyBufferSize() { + return copyBufferSize; + } + /** * Add options to configuration. These will be used in the Mapper/committer * @@ -351,6 +359,8 @@ public final class DistCpOptions { } DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BLOCKS_PER_CHUNK, String.valueOf(blocksPerChunk)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.COPY_BUFFER_SIZE, + String.valueOf(copyBufferSize)); } /** @@ -385,6 +395,7 @@ public final class DistCpOptions { ", targetPath=" + targetPath + ", filtersFile='" + filtersFile + '\'' + ", blocksPerChunk=" + blocksPerChunk + + ", copyBufferSize=" + copyBufferSize + '}'; } @@ -429,6 +440,9 @@ public final class DistCpOptions { private int blocksPerChunk = 0; + private int copyBufferSize = + DistCpConstants.COPY_BUFFER_SIZE_DEFAULT; + public Builder(List sourcePaths, Path targetPath) { Preconditions.checkArgument(sourcePaths != null && !sourcePaths.isEmpty(), "Source paths should not be null or empty!"); @@ -664,6 +678,13 @@ public final class DistCpOptions { this.blocksPerChunk = newBlocksPerChunk; return this; } + + public Builder withCopyBufferSize(int newCopyBufferSize) { + this.copyBufferSize = + newCopyBufferSize > 0 ? 
newCopyBufferSize + : DistCpConstants.COPY_BUFFER_SIZE_DEFAULT; + return this; + } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java index 21ff0f8..96fb1d9 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java @@ -213,6 +213,18 @@ public class OptionsParser { } } + if (command.hasOption(DistCpOptionSwitch.COPY_BUFFER_SIZE.getSwitch())) { + final String copyBufferSizeStr = getVal(command, + DistCpOptionSwitch.COPY_BUFFER_SIZE.getSwitch().trim()); + try { + int copyBufferSize = Integer.parseInt(copyBufferSizeStr); + builder.withCopyBufferSize(copyBufferSize); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("copyBufferSize is invalid: " + + copyBufferSizeStr, e); + } + } + return builder.build(); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java index 2c17fef..21f621a 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java @@ -38,6 +38,7 @@ import org.apache.hadoop.io.IOUtils; import 
org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.tools.CopyListingFileStatus; import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.DistCpOptionSwitch; import org.apache.hadoop.tools.DistCpOptions.FileAttribute; import org.apache.hadoop.tools.mapred.CopyMapper.FileAction; import org.apache.hadoop.tools.util.DistCpUtils; @@ -53,7 +54,6 @@ import com.google.common.annotations.VisibleForTesting; public class RetriableFileCopyCommand extends RetriableCommand { private static Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class); - private static int BUFFER_SIZE = 8 * 1024; private boolean skipCrc = false; private FileAction action; @@ -169,6 +169,9 @@ public class RetriableFileCopyCommand extends RetriableCommand { throws IOException { FsPermission permission = FsPermission.getFileDefault().applyUMask( FsPermission.getUMask(targetFS.getConf())); + int copyBufferSize = context.getConfiguration().getInt( + DistCpOptionSwitch.COPY_BUFFER_SIZE.getConfigLabel(), + DistCpConstants.COPY_BUFFER_SIZE_DEFAULT); final OutputStream outStream; if (action == FileAction.OVERWRITE) { // If there is an erasure coding policy set on the target directory, @@ -180,14 +183,14 @@ public class RetriableFileCopyCommand extends RetriableCommand { targetFS, targetPath); FSDataOutputStream out = targetFS.create(targetPath, permission, EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE), - BUFFER_SIZE, repl, blockSize, context, + copyBufferSize, repl, blockSize, context, getChecksumOpt(fileAttributes, sourceChecksum)); outStream = new BufferedOutputStream(out); } else { outStream = new BufferedOutputStream(targetFS.append(targetPath, - BUFFER_SIZE)); + copyBufferSize)); } - return copyBytes(source, sourceOffset, outStream, BUFFER_SIZE, + return copyBytes(source, sourceOffset, outStream, copyBufferSize, context); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm 
---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm index a77deb2..ee0a93e 100644 --- a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm +++ b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm @@ -238,6 +238,7 @@ Flag | Description | Notes `-numListstatusThreads` | Number of threads to use for building file listing | At most 40 threads. `-skipcrccheck` | Whether to skip CRC checks between source and target paths. | `-blocksperchunk <blocksperchunk>` | Number of blocks per chunk. When specified, split files into chunks to copy in parallel | If set to a positive value, files with more blocks than this value will be split into chunks of `<blocksperchunk>` blocks to be transferred in parallel, and reassembled on the destination. By default, `<blocksperchunk>` is 0 and the files will be transmitted in their entirety without splitting. This switch is only applicable when the source file system implements getBlockLocations method and the target file system implements concat method. | +`-copybuffersize <copybuffersize>` | Size of the copy buffer to use. 
By default, `<copybuffersize>` is set to 8192B | Architecture of DistCp ---------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/b4adc839/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java index 3525194..6b59b97 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java @@ -287,7 +287,7 @@ public class TestDistCpOptions { "mapBandwidth=0.0, copyStrategy='uniformsize', preserveStatus=[], " + "atomicWorkPath=null, logPath=null, sourceFileListing=abc, " + "sourcePaths=null, targetPath=xyz, filtersFile='null'," + - " blocksPerChunk=0}"; + " blocksPerChunk=0, copyBufferSize=8192}"; String optionString = option.toString(); Assert.assertEquals(val, optionString); Assert.assertNotSame(DistCpOptionSwitch.ATOMIC_COMMIT.toString(), @@ -497,4 +497,21 @@ public class TestDistCpOptions { Assert.assertFalse(builder.build().shouldAppend()); } + @Test + public void testSetCopyBufferSize() { + final DistCpOptions.Builder builder = new DistCpOptions.Builder( + Collections.singletonList(new Path("hdfs://localhost:8020/source")), + new Path("hdfs://localhost:8020/target/")); + + Assert.assertEquals(DistCpConstants.COPY_BUFFER_SIZE_DEFAULT, + builder.build().getCopyBufferSize()); + + builder.withCopyBufferSize(4194304); + Assert.assertEquals(4194304, + builder.build().getCopyBufferSize()); + + builder.withCopyBufferSize(-1); + Assert.assertEquals(DistCpConstants.COPY_BUFFER_SIZE_DEFAULT, + builder.build().getCopyBufferSize()); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: 
common-commits-unsubscribe@hadoop.apache.org For additional commands, e-mail: common-commits-help@hadoop.apache.org