parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ziva...@apache.org
Subject parquet-mr git commit: PARQUET-1115: Warn users when misusing parquet-tools merge
Date Tue, 07 Nov 2017 14:39:56 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master d55a572e5 -> 328c5deb0


PARQUET-1115: Warn users when misusing parquet-tools merge

Author: Nandor Kollar <nkollar@cloudera.com>

Closes #433 from nandorKollar/PARQUET-1115 and squashes the following commits:

5504a39 [Nandor Kollar] PARQUET-1115: Warn users when misusing parquet-tools merge
f2ece26 [Nandor Kollar] PARQUET-1115: Warn users when misusing parquet-tools merge
4f3ec99 [Nandor Kollar] PARQUET-1115: Warn users when misusing parquet-tools merge
f97e620 [Nandor Kollar] PARQUET-1115: Prevent users from misusing parquet-tools merge


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/328c5deb
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/328c5deb
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/328c5deb

Branch: refs/heads/master
Commit: 328c5deb015ee5bc0a24623bc29225f6ec1ae23d
Parents: d55a572
Author: Nandor Kollar <nkollar@cloudera.com>
Authored: Tue Nov 7 14:37:39 2017 +0100
Committer: Zoltan Ivanfi <zi@cloudera.com>
Committed: Tue Nov 7 14:37:39 2017 +0100

----------------------------------------------------------------------
 .../java/org/apache/parquet/tools/Main.java     |  1 +
 .../parquet/tools/command/CatCommand.java       |  5 ++++
 .../apache/parquet/tools/command/Command.java   |  3 ++-
 .../parquet/tools/command/DumpCommand.java      |  7 +++++-
 .../parquet/tools/command/HeadCommand.java      |  5 ++++
 .../parquet/tools/command/MergeCommand.java     | 26 ++++++++++++++++++++
 .../parquet/tools/command/RowCountCommand.java  |  5 ++++
 .../parquet/tools/command/ShowMetaCommand.java  |  8 +++---
 .../tools/command/ShowSchemaCommand.java        |  7 ++++--
 .../parquet/tools/command/SizeCommand.java      |  5 ++++
 .../apache/parquet/tools/read/SimpleRecord.java |  7 ++++--
 .../tools/read/SimpleRecordConverter.java       | 12 ---------
 .../parquet/tools/util/MetadataUtils.java       |  3 ---
 13 files changed, 70 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/Main.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/Main.java b/parquet-tools/src/main/java/org/apache/parquet/tools/Main.java
index 3a3919b..94a6979 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/Main.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/Main.java
@@ -99,6 +99,7 @@ public class Main {
       ustr = ustr + " " + usage[0];
     }
 
+    format.printWrapped(err, WIDTH, name + ":\n" + command.getCommandDescription());
     format.printUsage(err, WIDTH, ustr);
     format.printWrapped(err, WIDTH, LEFT_PAD, "where option is one of:");
     format.printOptions(err, WIDTH, options, LEFT_PAD, DESC_PAD);

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
index 59af508..d6e8a36 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
@@ -60,6 +60,11 @@ public class CatCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the content of a Parquet file. The output contains only the data, no metadata is displayed";
+  }
+
+  @Override
   public Options getOptions() {
     return OPTIONS;
   }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/Command.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Command.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Command.java
index 0e23018..d83f6fc 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Command.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Command.java
@@ -24,7 +24,8 @@ import org.apache.commons.cli.Options;
 public interface Command {
   Options getOptions();
   boolean supportsExtraArgs();
-  public String[] getUsageDescription();
+  String[] getUsageDescription();
+  String getCommandDescription();
 
   void execute(CommandLine options) throws Exception;
 }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java
index d5e8d94..6cb12fa 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java
@@ -113,7 +113,12 @@ public class DumpCommand extends ArgsOnlyCommand {
         return USAGE;
     }
 
-    @Override
+  @Override
+  public String getCommandDescription() {
+    return "Prints the content and metadata of a Parquet file";
+  }
+
+  @Override
     public void execute(CommandLine options) throws Exception {
         super.execute(options);
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/HeadCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/HeadCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/HeadCommand.java
index b5d2c89..5b875ee 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/HeadCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/HeadCommand.java
@@ -64,6 +64,11 @@ public class HeadCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the first n record of the Parquet file";
+  }
+
+  @Override
   public void execute(CommandLine options) throws Exception {
     super.execute(options);
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java
index 73e9b44..5d79a49 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java
@@ -26,8 +26,10 @@ import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.util.HiddenFileFilter;
 import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.tools.Main;
 
 import java.io.IOException;
+import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -42,6 +44,7 @@ public class MergeCommand extends ArgsOnlyCommand {
    * Biggest number of input files we can merge.
    */
   private static final int MAX_FILE_NUM = 100;
+  private static final long TOO_SMALL_FILE_THRESHOLD = 64 * 1024 * 1024;
 
   private Configuration conf;
 
@@ -57,6 +60,14 @@ public class MergeCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Merges multiple Parquet files into one. " +
+      "The command doesn't merge row groups, just places one after the other. " +
+      "When used to merge many small files, the resulting file will still contain small row groups, " +
+      "which usually leads to bad query performance.";
+  }
+
+  @Override
   public void execute(CommandLine options) throws Exception {
     // Prepare arguments
     List<String> args = options.getArgList();
@@ -65,14 +76,29 @@ public class MergeCommand extends ArgsOnlyCommand {
 
     // Merge schema and extraMeta
     FileMetaData mergedMeta = mergedMetadata(inputFiles);
+    PrintWriter out = new PrintWriter(Main.out, true);
 
     // Merge data
     ParquetFileWriter writer = new ParquetFileWriter(conf,
             mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
     writer.start();
+    boolean tooSmallFilesMerged = false;
     for (Path input: inputFiles) {
+      if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
+        out.format("Warning: file %s is too small, length: %d\n",
+          input,
+          input.getFileSystem(conf).getFileStatus(input).getLen());
+        tooSmallFilesMerged = true;
+      }
+
       writer.appendFile(conf, input);
     }
+
+    if (tooSmallFilesMerged) {
+      out.println("Warning: you merged too small files. " +
+        "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
+        "which usually leads to bad query performance!");
+    }
     writer.end(mergedMeta.getKeyValueMetaData());
   }
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
index 37d6079..6005571 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
@@ -67,6 +67,11 @@ public class RowCountCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the count of rows in Parquet file(s)";
+  }
+
+  @Override
   public void execute(CommandLine options) throws Exception {
     super.execute(options);
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java
index 3fc8ba4..8d35551 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java
@@ -18,8 +18,6 @@
  */
 package org.apache.parquet.tools.command;
 
-import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;
-
 import org.apache.commons.cli.CommandLine;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -27,7 +25,6 @@ import org.apache.hadoop.fs.Path;
 
 import org.apache.parquet.hadoop.Footer;
 import org.apache.parquet.hadoop.ParquetFileReader;
-import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.tools.util.MetadataUtils;
 import org.apache.parquet.tools.util.PrettyPrintWriter;
 import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;
@@ -50,6 +47,11 @@ public class ShowMetaCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the metadata of Parquet file(s)";
+  }
+
+  @Override
   public void execute(CommandLine options) throws Exception {
     super.execute(options);
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java
index 40831ba..d83e564 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java
@@ -18,8 +18,6 @@
  */
 package org.apache.parquet.tools.command;
 
-import java.text.DecimalFormat;
-
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
@@ -64,6 +62,11 @@ public class ShowSchemaCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the schema of Parquet file(s)";
+  }
+
+  @Override
   public Options getOptions() {
     return OPTIONS;
   }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
index bcc6704..7c4665d 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
@@ -82,6 +82,11 @@ public class SizeCommand extends ArgsOnlyCommand {
   }
 
   @Override
+  public String getCommandDescription() {
+    return "Prints the size of Parquet file(s)";
+  }
+
+  @Override
   public void execute(CommandLine options) throws Exception {
     super.execute(options);
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
index 39c1ce0..5585419 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
@@ -20,7 +20,11 @@ package org.apache.parquet.tools.read;
 
 import java.io.IOException;
 import java.io.PrintWriter;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
 
 import com.google.common.base.Strings;
 import com.google.common.collect.Maps;
@@ -28,7 +32,6 @@ import org.codehaus.jackson.map.ObjectMapper;
 import org.codehaus.jackson.node.BinaryNode;
 
 public class SimpleRecord {
-  public static final int TAB_SIZE = 2;
   protected final List<NameValue> values;
 
   public SimpleRecord() {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java
index 29bb44f..a119a34 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java
@@ -18,10 +18,6 @@
  */
 package org.apache.parquet.tools.read;
 
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-
 import java.math.BigDecimal;
 import java.math.BigInteger;
 
@@ -33,15 +29,7 @@ import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.Type;
 
-/**
- * 
- * 
- * @author 
- */
 public class SimpleRecordConverter extends GroupConverter {
-  private static final Charset UTF8 = Charset.forName("UTF-8");
-  private static final CharsetDecoder UTF8_DECODER = UTF8.newDecoder();
-
   private final Converter converters[];
   private final String name;
   private final SimpleRecordConverter parent;

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/328c5deb/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java b/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java
index 7758134..b7fb0e6 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java
@@ -19,9 +19,6 @@
 package org.apache.parquet.tools.util;
 
 import java.util.ArrayList;
-import java.util.Arrays;
-
-import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;


Mime
View raw message