hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ste...@apache.org
Subject [hadoop] branch trunk updated: HDFS-14788. Use dynamic regex filter to ignore copy of source files in Distcp.
Date Mon, 06 Jan 2020 19:11:01 GMT
This is an automated email from the ASF dual-hosted git repository.

stevel pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 819159f  HDFS-14788. Use dynamic regex filter to ignore copy of source files in Distcp.
819159f is described below

commit 819159fa060897bcf7c9ae09bf4b2fc97292f92b
Author: Mukund Thakur <mthakur@cloudera.com>
AuthorDate: Mon Jan 6 19:09:07 2020 +0000

    HDFS-14788. Use dynamic regex filter to ignore copy of source files in Distcp.
    
    Contributed by Mukund Thakur.
    
    Change-Id: I781387ddce95ee300c12a160dc9a0f7d602403c3
---
 .../java/org/apache/hadoop/tools/CopyFilter.java   | 31 +++++++
 .../org/apache/hadoop/tools/DistCpConstants.java   | 14 ++++
 .../hadoop/tools/RegexpInConfigurationFilter.java  | 72 ++++++++++++++++
 .../hadoop-distcp/src/site/markdown/DistCp.md.vm   | 17 ++++
 .../org/apache/hadoop/tools/TestCopyFilter.java    | 97 ++++++++++++++++++++++
 .../tools/TestRegexpInConfigurationFilter.java     | 55 ++++++++++++
 6 files changed, 286 insertions(+)

diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
index 4b348a5..f5f00f1 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
@@ -17,6 +17,11 @@
  */
 package org.apache.hadoop.tools;
 
+import java.lang.reflect.Constructor;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 
@@ -26,6 +31,8 @@ import org.apache.hadoop.fs.Path;
  */
 public abstract class CopyFilter {
 
+  private static final Logger LOG = LoggerFactory.getLogger(CopyFilter.class);
+
   /**
    * Default initialize method does nothing.
    */
@@ -47,6 +54,30 @@ public abstract class CopyFilter {
    * @return An instance of the appropriate CopyFilter
    */
   public static CopyFilter getCopyFilter(Configuration conf) {
+    String filtersClassName = conf
+            .get(DistCpConstants.CONF_LABEL_FILTERS_CLASS);
+    if (filtersClassName != null) {
+      try {
+        Class<? extends CopyFilter> filtersClass = conf
+                .getClassByName(filtersClassName)
+                .asSubclass(CopyFilter.class);
+        filtersClassName = filtersClass.getName();
+        Constructor<? extends CopyFilter> constructor = filtersClass
+                .getDeclaredConstructor(Configuration.class);
+        return constructor.newInstance(conf);
+      } catch (Exception e) {
+        LOG.error(DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG +
+                filtersClassName, e);
+        throw new RuntimeException(
+                DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG +
+                        filtersClassName, e);
+      }
+    } else {
+      return getDefaultCopyFilter(conf);
+    }
+  }
+
+  private static CopyFilter getDefaultCopyFilter(Configuration conf) {
     String filtersFilename = conf.get(DistCpConstants.CONF_LABEL_FILTERS_FILE);
 
     if (filtersFilename == null) {
diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
index f0adc78..2581568 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
@@ -120,6 +120,17 @@ public final class DistCpConstants {
   /* DistCp CopyListing class override param */
   public static final String CONF_LABEL_COPY_LISTING_CLASS = "distcp.copy.listing.class";
 
+  /**
+   *  DistCp Filter class override param.
+   */
+  public static final String CONF_LABEL_FILTERS_CLASS = "distcp.filters.class";
+
+  /**
+   *  DistCp exclude file regex override param.
+   */
+  public static final String DISTCP_EXCLUDE_FILE_REGEX =
+          "distcp.exclude-file-regex";
+
   /* DistCp Copy Buffer Size */
   public static final String CONF_LABEL_COPY_BUFFER_SIZE =
       "distcp.copy.buffer.size";
@@ -177,4 +188,7 @@ public final class DistCpConstants {
 
   public static final String CHECKSUM_MISMATCH_ERROR_MSG =
           "Checksum mismatch between ";
+
+  public static final String CLASS_INSTANTIATION_ERROR_MSG =
+          "Unable to instantiate ";
 }
diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
new file mode 100644
index 0000000..4bf62e2
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+
+/**
+ * Implementation of regex based filter for DistCp.
+ * {@link DistCpConstants#CONF_LABEL_FILTERS_CLASS} needs to be set
+ * in {@link Configuration} when launching a distcp job.
+ */
+public class RegexpInConfigurationFilter extends CopyFilter {
+
+  private static final Logger LOG = LoggerFactory
+          .getLogger(RegexpInConfigurationFilter.class);
+
+  /**
+   * Regex which can be used to filter source files.
+   * {@link DistCpConstants#DISTCP_EXCLUDE_FILE_REGEX} can be set
+   * in {@link Configuration} when launching a DistCp job.
+   * If not set no files will be filtered.
+   */
+  private String excludeFileRegex;
+
+  private List<Pattern> filters = new ArrayList<>();
+
+  protected RegexpInConfigurationFilter(Configuration conf) {
+    excludeFileRegex = conf
+            .getTrimmed(DistCpConstants.DISTCP_EXCLUDE_FILE_REGEX, "");
+    if (!excludeFileRegex.isEmpty()) {
+      Pattern pattern = Pattern.compile(excludeFileRegex);
+      filters.add(pattern);
+    }
+  }
+
+  @Override
+  public boolean shouldCopy(Path path) {
+    for (Pattern filter : filters) {
+      if (filter.matcher(path.toString()).matches()) {
+        LOG.debug("Skipping {} as it matches the filter regex",
+                path.toString());
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
index a5c4011..bf5b891 100644
--- a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
+++ b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
@@ -440,6 +440,23 @@ $H3 Copy-listing Generator
   of DistCp differs here from the legacy DistCp, in how paths are considered
   for copy.
 
+  One may also customize the filtering of files which shouldn't be copied
+  by using one of the currently supported implementations of CopyFilter,
+  or by writing a new implementation. This can be specified by setting
+  `distcp.filters.class` in the DistCpOptions:
+
+  1. `distcp.filters.class` to "RegexCopyFilter". If you are using this implementation,
+     you will also have to pass the "CopyFilter" parameter `distcp.filters.file`, naming
+     the file which contains the regex used for filtering. Supports regular expressions
+     specified by java.util.regex.Pattern.
+  2. `distcp.filters.class` to "RegexpInConfigurationFilter". If you are using this
+     implementation, you will also have to pass the regex using the
+     `distcp.exclude-file-regex` parameter in "DistCpOptions". Supports regular
+     expressions specified by java.util.regex.Pattern. This is a more dynamic approach
+     compared to "RegexCopyFilter".
+  3. `distcp.filters.class` to "TrueCopyFilter". This is used as a default
+     implementation if none of the above options are specified.
+
   The legacy implementation only lists those paths that must definitely be
   copied on to target. E.g. if a file already exists at the target (and
   `-overwrite` isn't specified), the file isn't even considered in the
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
new file mode 100644
index 0000000..4d36f38
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test {@link CopyFilter}.
+ */
+public class TestCopyFilter {
+
+  @Test
+  public void testGetCopyFilterTrueCopyFilter() {
+    Configuration configuration = new Configuration(false);
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of TrueCopyFilter",
+            copyFilter instanceof TrueCopyFilter);
+  }
+
+  @Test
+  public void testGetCopyFilterRegexCopyFilter() {
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_FILE, "random");
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of RegexCopyFilter",
+            copyFilter instanceof RegexCopyFilter);
+  }
+
+  @Test
+  public void testGetCopyFilterRegexpInConfigurationFilter() {
+    final String filterName =
+            "org.apache.hadoop.tools.RegexpInConfigurationFilter";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of RegexpInConfigurationFilter",
+            copyFilter instanceof RegexpInConfigurationFilter);
+  }
+
+  @Test
+  public void testGetCopyFilterNonExistingClass() throws Exception {
+    final String filterName =
+            "org.apache.hadoop.tools.RegexpInConfigurationWrongFilter";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  @Test
+  public void testGetCopyFilterWrongClassType() throws Exception {
+    final String filterName =
+            "org.apache.hadoop.tools." +
+                    "TestCopyFilter.FilterNotExtendingCopyFilter";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  @Test
+  public void testGetCopyFilterEmptyString() throws Exception {
+    final String filterName = "";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  private class FilterNotExtendingCopyFilter {
+
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
new file mode 100644
index 0000000..8cf0620
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test {@link RegexpInConfigurationFilter}.
+ */
+public class TestRegexpInConfigurationFilter {
+
+  @Test
+  public void testShouldCopy() {
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.DISTCP_EXCLUDE_FILE_REGEX,
+            "\\/.*_COPYING_$|\\/.*_COPYING$|^.*\\/\\.[^\\/]*$|" +
+                    "\\/_temporary$|\\/\\_temporary\\/|.*\\/\\.Trash\\/.*");
+    RegexpInConfigurationFilter defaultCopyFilter =
+            new RegexpInConfigurationFilter(configuration);
+    Path shouldCopyPath = new Path("/user/bar");
+    assertTrue(shouldCopyPath.toString() + " should be copied",
+            defaultCopyFilter.shouldCopy(shouldCopyPath));
+    shouldCopyPath = new Path("/user/bar/_COPYING");
+    assertFalse(shouldCopyPath.toString() + " shouldn't be copied",
+            defaultCopyFilter.shouldCopy(shouldCopyPath));
+    shouldCopyPath = new Path("/user/bar/_COPYING_");
+    assertFalse(shouldCopyPath.toString() + " shouldn't be copied",
+            defaultCopyFilter.shouldCopy(shouldCopyPath));
+    shouldCopyPath = new Path("/temp/");
+    assertTrue(shouldCopyPath.toString() + " should be copied",
+            defaultCopyFilter.shouldCopy(shouldCopyPath));
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message