parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ziva...@apache.org
Subject [parquet-mr] branch column-indexes updated: PARQUET-1212: Column indexes: Show indexes in tools (#479)
Date Tue, 22 May 2018 13:19:16 GMT
This is an automated email from the ASF dual-hosted git repository.

zivanfi pushed a commit to branch column-indexes
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/column-indexes by this push:
     new 6165a0c  PARQUET-1212: Column indexes: Show indexes in tools (#479)
6165a0c is described below

commit 6165a0c4ee695708562b7787d484d48fdd6eb074
Author: Gabor Szadovszky <gabor@apache.org>
AuthorDate: Tue May 22 15:19:12 2018 +0200

    PARQUET-1212: Column indexes: Show indexes in tools (#479)
---
 .../src/main/java/org/apache/parquet/cli/Main.java |   2 +
 .../cli/commands/ShowColumnIndexCommand.java       | 166 ++++++++++++++++++
 .../parquet/tools/command/ColumnIndexCommand.java  | 190 +++++++++++++++++++++
 .../org/apache/parquet/tools/command/Registry.java |   1 +
 4 files changed, 359 insertions(+)

diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 990193c..fa69ce7 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -32,6 +32,7 @@ import org.apache.parquet.cli.commands.ConvertCSVCommand;
 import org.apache.parquet.cli.commands.ConvertCommand;
 import org.apache.parquet.cli.commands.ParquetMetadataCommand;
 import org.apache.parquet.cli.commands.SchemaCommand;
+import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
 import org.apache.parquet.cli.commands.ShowDictionaryCommand;
 import org.apache.parquet.cli.commands.ShowPagesCommand;
 import org.apache.parquet.cli.commands.ToAvroCommand;
@@ -87,6 +88,7 @@ public class Main extends Configured implements Tool {
     jc.addCommand("to-avro", new ToAvroCommand(console));
     jc.addCommand("cat", new CatCommand(console, 0));
     jc.addCommand("head", new CatCommand(console, 10));
+    jc.addCommand("column-index", new ShowColumnIndexCommand(console));
   }
 
   @Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java
new file mode 100644
index 0000000..0407a8d
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.io.InputFile;
+import org.slf4j.Logger;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * parquet-cli command to print column and offset indexes.
+ */
+@Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file")
+public class ShowColumnIndexCommand extends BaseCommand {
+  public ShowColumnIndexCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description = "<parquet path>")
+  List<String> files;
+
+  @Parameter(names = { "-c", "--column" }, description = "Shows the column/offset indexes for the given column only")
+  List<String> ColumnPaths;
+
+  @Parameter(names = { "-b",
+      "--block" }, description = "Shows the column/offset indexes for the given block (row-group) only; "
+          + "blocks are referenced by their indexes from 0")
+  List<String> blockIndexes;
+
+  @Parameter(names = { "-i", "--column-index" }, description = "Shows the column indexes; "
+      + "active by default unless -o is used")
+  boolean showColumnIndex;
+
+  @Parameter(names = { "-o", "--offset-index" }, description = "Shows the offset indexes; "
+      + "active by default unless -i is used")
+  boolean showOffsetIndex;
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Show only column indexes for column 'col' from a Parquet file",
+        "-c col -i sample.parquet");
+  }
+
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(files != null && files.size() >= 1,
+        "A Parquet file is required.");
+    Preconditions.checkArgument(files.size() == 1,
+        "Cannot process multiple Parquet files.");
+
+    InputFile in = HadoopInputFile.fromPath(new Path(files.get(0)), new Configuration());
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter())) {
+        if (!firstBlock) {
+          console.info("");
+        }
+        firstBlock = false;
+        console.info("row group {}:", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue())) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            console.info("column index for column {}:", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(columnIndex.toString());
+            }
+          }
+          if (showOffsetIndex) {
+            console.info("offset index for column {}:", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(offsetIndex.toString());
+            }
+          }
+        }
+      }
+    }
+    return 0;
+  }
+
+  // Returns the index-block pairs based on the arguments of --block
+  private List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (blockIndexes == null || blockIndexes.isEmpty()) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : blockIndexes) {
+        int index = Integer.parseInt(indexStr);
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  private List<ColumnChunkMetaData> getColumns(BlockMetaData block) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    if (ColumnPaths == null || ColumnPaths.isEmpty()) {
+      return columns;
+    }
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : ColumnPaths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
new file mode 100644
index 0000000..f31599a
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.tools.Main;
+
+/**
+ * parquet-tools command to print column and offset indexes.
+ */
+public class ColumnIndexCommand extends ArgsOnlyCommand {
+  public static final String[] USAGE = new String[] {
+      "<input>",
+      "where <input> is the parquet file to print the column and offset indexes for"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    OPTIONS.addOption(Option.builder("c")
+        .longOpt("column")
+        .desc("Shows the column/offset indexes for the given column only; "
+            + "multiple columns shall be separated by commas")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("b")
+        .longOpt("block")
+        .desc("Shows the column/offset indexes for the given block (row-group) only; "
+            + "multiple blocks shall be speparated by commas; "
+            + "blocks are referenced by their indexes from 0")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("i")
+        .longOpt("column-index")
+        .desc("Shows the column indexes; "
+            + "active by default unless -o is used")
+        .hasArg(false)
+        .build());
+    OPTIONS.addOption(Option.builder("o")
+        .longOpt("offset-index")
+        .desc("Shows the offset indexes; "
+            + "active by default unless -i is used")
+        .hasArg(false)
+        .build());
+  }
+
+  public ColumnIndexCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public String getCommandDescription() {
+    return "Prints the column and offset indexes of a Parquet file.";
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    InputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
+    PrintWriter out = new PrintWriter(Main.out, true);
+    String blockValue = options.getOptionValue("b");
+    String[] indexes = blockValue == null ? null : blockValue.split("\\s*,\\s*");
+    boolean showColumnIndex = options.hasOption("i");
+    boolean showOffsetIndex = options.hasOption("o");
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter(), indexes)) {
+        if (!firstBlock) {
+          out.println();
+        }
+        firstBlock = false;
+        out.format("row group %d:%n", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue(), options)) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            out.format("column index for column %s:%n", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(columnIndex);
+            }
+          }
+          if (showOffsetIndex) {
+            out.format("offset index for column %s:%n", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(offsetIndex);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Returns the index-block pairs based on the arguments of --block
+  private static List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta, String[] indexes) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (indexes == null) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : indexes) {
+        int index = Integer.parseInt(indexStr);
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  private static List<ColumnChunkMetaData> getColumns(BlockMetaData block, CommandLine options) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    String pathValue = options.getOptionValue("c");
+    if (pathValue == null) {
+      return columns;
+    }
+    String[] paths = pathValue.split("\\s*,\\s*");
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : paths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
index 6df84be..399efb7 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
@@ -34,6 +34,7 @@ public final class Registry {
     registry.put("merge", MergeCommand.class);
     registry.put("rowcount", RowCountCommand.class);
     registry.put("size", SizeCommand.class);
+    registry.put("column-index", ColumnIndexCommand.class);
   }
 
   public static Map<String,Command> allCommands() {

-- 
To stop receiving notification emails like this one, please contact
zivanfi@apache.org.

Mime
View raw message