parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject git commit: PARQUET-107: Add option to disable summary metadata.
Date Wed, 01 Oct 2014 21:14:35 GMT
Repository: incubator-parquet-mr
Updated Branches:
  refs/heads/master da9129927 -> be1222ef4


PARQUET-107: Add option to disable summary metadata.

This adds an option to the commitJob phase of the MR OutputCommitter,
parquet.enable.summary-metadata (default true), that can be used to
disable the summary metadata files generated from the footers of all of
the files produced. This enables more control over when those summary
files are produced and makes it possible to rename MR outputs and then
generate the summaries.

Author: Ryan Blue <rblue@cloudera.com>

Closes #68 from rdblue/PARQUET-107-add-summary-metadata-option and squashes the following commits:

261e5e4 [Ryan Blue] PARQUET-107: Add option to disable summary metadata.


Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/be1222ef
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/be1222ef
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/be1222ef

Branch: refs/heads/master
Commit: be1222ef4a3260ddcf516d73c6ceecd144a134cb
Parents: da91299
Author: Ryan Blue <rblue@cloudera.com>
Authored: Wed Oct 1 14:14:24 2014 -0700
Committer: julien <julien@twitter.com>
Committed: Wed Oct 1 14:14:24 2014 -0700

----------------------------------------------------------------------
 .../parquet/hadoop/ParquetOutputCommitter.java  | 26 +++++++++++---------
 .../parquet/hadoop/ParquetOutputFormat.java     |  5 ++++
 2 files changed, 19 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/be1222ef/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputCommitter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputCommitter.java b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputCommitter.java
index 940b893..6ad55f9 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputCommitter.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputCommitter.java
@@ -41,22 +41,24 @@ public class ParquetOutputCommitter extends FileOutputCommitter {
 
   public void commitJob(JobContext jobContext) throws IOException {
     super.commitJob(jobContext);
-    try {
-      Configuration configuration = ContextUtil.getConfiguration(jobContext);
-      final FileSystem fileSystem = outputPath.getFileSystem(configuration);
-      FileStatus outputStatus = fileSystem.getFileStatus(outputPath);
-      List<Footer> footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus);
+    Configuration configuration = ContextUtil.getConfiguration(jobContext);
+    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
       try {
-        ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers);
+        final FileSystem fileSystem = outputPath.getFileSystem(configuration);
+        FileStatus outputStatus = fileSystem.getFileStatus(outputPath);
+        List<Footer> footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus);
+        try {
+          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers);
+        } catch (Exception e) {
+          LOG.warn("could not write summary file for " + outputPath, e);
+          final Path metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE);
+          if (fileSystem.exists(metadataPath)) {
+            fileSystem.delete(metadataPath, true);
+          }
+        }
       } catch (Exception e) {
         LOG.warn("could not write summary file for " + outputPath, e);
-        final Path metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE);
-        if (fileSystem.exists(metadataPath)) {
-          fileSystem.delete(metadataPath, true);
-        }
       }
-    } catch (Exception e) {
-      LOG.warn("could not write summary file for " + outputPath, e);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/be1222ef/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputFormat.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputFormat.java
index 74f4051..98e73e0 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputFormat.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetOutputFormat.java
@@ -73,6 +73,10 @@ import parquet.hadoop.util.ConfigurationUtil;
  *
  * # To enable/disable dictionary encoding
  * parquet.enable.dictionary=true # false to disable dictionary encoding
+ *
+ * # To enable/disable summary metadata aggregation at the end of a MR job
+ * # The default is true (enabled)
+ * parquet.enable.summary-metadata=true # false to disable summary aggregation
  * </pre>
  *
  * If parquet.compression is not set, the following properties are checked (FileOutputFormat behavior).
@@ -99,6 +103,7 @@ public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
   public static final String ENABLE_DICTIONARY    = "parquet.enable.dictionary";
   public static final String VALIDATION           = "parquet.validation";
   public static final String WRITER_VERSION       = "parquet.writer.version";
+  public static final String ENABLE_JOB_SUMMARY   = "parquet.enable.summary-metadata";
 
   public static void setWriteSupportClass(Job job,  Class<?> writeSupportClass) {
     getConfiguration(job).set(WRITE_SUPPORT_CLASS, writeSupportClass.getName());


Mime
View raw message