hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ste...@apache.org
Subject hadoop git commit: HADOOP-13145 In DistCp, prevent unnecessary getFileStatus call when not preserving metadata. Contributed by Chris Nauroth.
Date Sat, 21 May 2016 18:10:30 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 f2aef95bd -> 5930e813c


HADOOP-13145 In DistCp, prevent unnecessary getFileStatus call when not preserving metadata.
Contributed by Chris Nauroth.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/5930e813
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/5930e813
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/5930e813

Branch: refs/heads/branch-2.8
Commit: 5930e813c5b6603ccf418b54663272ed92f43872
Parents: f2aef95
Author: Steve Loughran <stevel@apache.org>
Authored: Sat May 21 19:10:16 2016 +0100
Committer: Steve Loughran <stevel@apache.org>
Committed: Sat May 21 19:10:16 2016 +0100

----------------------------------------------------------------------
 hadoop-project/pom.xml                          |   6 +
 hadoop-tools/hadoop-aws/pom.xml                 |  11 +
 .../src/site/markdown/tools/hadoop-aws/index.md |   9 +
 .../fs/contract/s3a/TestS3AContractDistCp.java  |  46 +++++
 hadoop-tools/hadoop-azure/pom.xml               |  19 ++
 .../contract/TestAzureNativeContractDistCp.java |  33 +++
 hadoop-tools/hadoop-distcp/pom.xml              |  16 ++
 .../apache/hadoop/tools/util/DistCpUtils.java   |  10 +-
 .../contract/AbstractContractDistCpTest.java    | 207 +++++++++++++++++++
 9 files changed, 354 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-project/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
index 8e99d02..388a375 100644
--- a/hadoop-project/pom.xml
+++ b/hadoop-project/pom.xml
@@ -351,6 +351,12 @@
       </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-distcp</artifactId>
+        <version>${project.version}</version>
+        <type>test-jar</type>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-datajoin</artifactId>
         <version>${project.version}</version>
       </dependency>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-aws/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/pom.xml b/hadoop-tools/hadoop-aws/pom.xml
index 9b43456..a084318 100644
--- a/hadoop-tools/hadoop-aws/pom.xml
+++ b/hadoop-tools/hadoop-aws/pom.xml
@@ -235,5 +235,16 @@
       <scope>test</scope>
       <type>jar</type>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index 010d00e..779639f 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -741,6 +741,15 @@ or in batch runs.
 Smaller values should result in faster test runs, especially when the object
 store is a long way away.
 
+DistCp tests targeting S3A support a configurable file size.  The default is
+10 MB, but the configuration value is expressed in KB so that it can be tuned
+smaller to achieve faster test runs.
+
+      <property>
+        <name>scale.test.distcp.file.size.kb</name>
+        <value>10240</value>
+      </property>
+
 ### Running the Tests
 
 After completing the configuration, execute the test run through Maven.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java
b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java
new file mode 100644
index 0000000..7eb0afa
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.contract.s3a;
+
+import static org.apache.hadoop.fs.s3a.Constants.MIN_MULTIPART_THRESHOLD;
+import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/**
+ * Contract test suite covering S3A integration with DistCp.
+ */
+public class TestS3AContractDistCp extends AbstractContractDistCpTest {
+
+  private static final long MULTIPART_SETTING = 8 * 1024 * 1024; // 8 MB
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration newConf = super.createConfiguration();
+    newConf.setLong(MIN_MULTIPART_THRESHOLD, MULTIPART_SETTING);
+    newConf.setLong(MULTIPART_SIZE, MULTIPART_SETTING);
+    return newConf;
+  }
+
+  @Override
+  protected S3AContract createContract(Configuration conf) {
+    return new S3AContract(conf);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-azure/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-azure/pom.xml b/hadoop-tools/hadoop-azure/pom.xml
index c4bfede..5673cd2 100644
--- a/hadoop-tools/hadoop-azure/pom.xml
+++ b/hadoop-tools/hadoop-azure/pom.xml
@@ -176,6 +176,25 @@
     </dependency>
     
     <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+
+    <dependency>
       <groupId>org.mockito</groupId>
       <artifactId>mockito-all</artifactId>
       <scope>test</scope>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java
b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java
new file mode 100644
index 0000000..a3750d4
--- /dev/null
+++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azure.contract;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/**
+ * Contract test suite covering WASB integration with DistCp.
+ */
+public class TestAzureNativeContractDistCp extends AbstractContractDistCpTest {
+
+  @Override
+  protected NativeAzureFileSystemContract createContract(Configuration conf) {
+    return new NativeAzureFileSystemContract(conf);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-distcp/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-distcp/pom.xml b/hadoop-tools/hadoop-distcp/pom.xml
index eeec27d..1ae01d4 100644
--- a/hadoop-tools/hadoop-distcp/pom.xml
+++ b/hadoop-tools/hadoop-distcp/pom.xml
@@ -175,6 +175,22 @@
             </manifest>
           </archive>
         </configuration>
+        <executions>
+          <execution>
+            <id>prepare-jar</id>
+            <phase>prepare-package</phase>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>prepare-test-jar</id>
+            <phase>prepare-package</phase>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
index d3d7677..1784c5d 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
@@ -195,9 +195,13 @@ public class DistCpUtils {
                               EnumSet<FileAttribute> attributes,
                               boolean preserveRawXattrs) throws IOException {
 
-    FileStatus targetFileStatus = targetFS.getFileStatus(path);
-    String group = targetFileStatus.getGroup();
-    String user = targetFileStatus.getOwner();
+    // If not preserving anything from FileStatus, don't bother fetching it.
+    FileStatus targetFileStatus = attributes.isEmpty() ? null :
+        targetFS.getFileStatus(path);
+    String group = targetFileStatus == null ? null :
+        targetFileStatus.getGroup();
+    String user = targetFileStatus == null ? null :
+        targetFileStatus.getOwner();
     boolean chown = false;
 
     if (attributes.contains(FileAttribute.ACL)) {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5930e813/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java
new file mode 100644
index 0000000..7aeab55
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java
@@ -0,0 +1,207 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.contract;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.tools.DistCp;
+import org.apache.hadoop.tools.DistCpOptions;
+
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+/**
+ * Contract test suite covering a file system's integration with DistCp.  The
+ * tests coordinate two file system instances: one "local", which is the local
+ * file system, and the other "remote", which is the file system implementation
+ * under test.  The tests in the suite cover both copying from local to remote
+ * (e.g. a backup use case) and copying from remote to local (e.g. a restore use
+ * case).
+ */
+public abstract class AbstractContractDistCpTest
+    extends AbstractFSContractTestBase {
+
+  @Rule
+  public TestName testName = new TestName();
+
+  private Configuration conf;
+  private FileSystem localFS, remoteFS;
+  private Path localDir, remoteDir;
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration newConf = new Configuration();
+    newConf.set("mapred.job.tracker", "local");
+    return newConf;
+  }
+
+  @Before
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    conf = getContract().getConf();
+    localFS = FileSystem.getLocal(conf);
+    remoteFS = getFileSystem();
+    // Test paths are isolated by concrete subclass name and test method name.
+    // All paths are fully qualified including scheme (not taking advantage of
+    // default file system), so if something fails, the messages will make it
+    // clear which paths are local and which paths are remote.
+    Path testSubDir = new Path(getClass().getSimpleName(),
+        testName.getMethodName());
+    String testDirProp = System.getProperty("test.build.data",
+        "target" + File.separator + "test" + File.separator + "data");
+    File testDir = new File(testDirProp).getAbsoluteFile();
+    localDir = localFS.makeQualified(new Path(new Path(
+        testDir.toURI()), testSubDir));
+    mkdirs(localFS, localDir);
+    remoteDir = remoteFS.makeQualified(
+        new Path(getContract().getTestPath(), testSubDir));
+    mkdirs(remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureToRemote() throws Exception {
+    describe("copy a deep directory structure from local to remote");
+    deepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void largeFilesToRemote() throws Exception {
+    describe("copy multiple large files from local to remote");
+    largeFiles(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureFromRemote() throws Exception {
+    describe("copy a deep directory structure from remote to local");
+    deepDirectoryStructure(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  @Test
+  public void largeFilesFromRemote() throws Exception {
+    describe("copy multiple large files from remote to local");
+    largeFiles(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  /**
+   * Executes a test using a file system sub-tree with multiple nesting levels.
+   *
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void deepDirectoryStructure(FileSystem srcFS, Path srcDir,
+      FileSystem dstFS, Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputSubDir1 = new Path(inputDir, "subDir1");
+    Path inputSubDir2 = new Path(inputDir, "subDir2/subDir3");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputSubDir1, "file2");
+    Path inputFile3 = new Path(inputSubDir2, "file3");
+    mkdirs(srcFS, inputSubDir1);
+    mkdirs(srcFS, inputSubDir2);
+    byte[] data1 = dataset(100, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset(200, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset(300, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir1/file2"), data2);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir2/subDir3/file3"), data3);
+  }
+
+  /**
+   * Executes a test using three files sized 1, 2 and 3 MB above the setting.
+   *
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void largeFiles(FileSystem srcFS, Path srcDir, FileSystem dstFS,
+      Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputDir, "file2");
+    Path inputFile3 = new Path(inputDir, "file3");
+    mkdirs(srcFS, inputDir);
+    int fileSizeKb = conf.getInt("scale.test.distcp.file.size.kb", 10 * 1024);
+    int fileSizeMb = fileSizeKb / 1024; // KB to whole MB (rounds down)
+    getLog().info("{} with file size {}", testName.getMethodName(), fileSizeMb);
+    byte[] data1 = dataset((fileSizeMb + 1) * 1024 * 1024, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset((fileSizeMb + 2) * 1024 * 1024, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset((fileSizeMb + 3) * 1024 * 1024, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file2"), data2);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file3"), data3);
+  }
+
+  /**
+   * Executes DistCp and asserts that the job finished successfully.
+   *
+   * @param src source path
+   * @param dst destination path
+   * @throws Exception if there is a failure
+   */
+  private void runDistCp(Path src, Path dst) throws Exception {
+    DistCpOptions options = new DistCpOptions(Arrays.asList(src), dst);
+    Job job = new DistCp(conf, options).execute();
+    assertNotNull("Unexpected null job returned from DistCp execution.", job);
+    assertTrue("DistCp job did not complete.", job.isComplete());
+    assertTrue("DistCp job did not complete successfully.", job.isSuccessful());
+  }
+
+  /**
+   * Creates a directory and any ancestor directories required.
+   *
+   * @param fs FileSystem in which to create directories
+   * @param dir path of directory to create
+   * @throws Exception if there is a failure
+   */
+  private static void mkdirs(FileSystem fs, Path dir) throws Exception {
+    assertTrue("Failed to mkdir " + dir, fs.mkdirs(dir));
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message