parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject parquet-mr git commit: PARQUET-367: "parquet-cat -j" doesn't show all records.
Date Thu, 05 May 2016 20:56:56 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master c3f3830f7 -> da69d4b76


PARQUET-367: "parquet-cat -j" doesn't show all records.

Added JsonRecordFormatter, which formats SimpleRecords into a structure that can be used with
ObjectMapper to create a valid JSON structure. Unit test included.

Author: Reuben Kuhnert <reuben.kuhnert@cloudera.com>

Closes #281 from sircodesalotOfTheRound/fix-parquet-cat and squashes the following commits:

67207ef [Reuben Kuhnert] PARQUET-367: "parquet-cat -j" doesn't show all records.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/da69d4b7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/da69d4b7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/da69d4b7

Branch: refs/heads/master
Commit: da69d4b764f4d13d38a4f7fe7462ef0c7d17c619
Parents: c3f3830
Author: Reuben Kuhnert <reuben.kuhnert@cloudera.com>
Authored: Thu May 5 13:56:53 2016 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Thu May 5 13:56:53 2016 -0700

----------------------------------------------------------------------
 .../parquet/tools/command/CatCommand.java       |   9 +-
 .../parquet/tools/json/JsonRecordFormatter.java | 132 +++++++++++
 .../apache/parquet/tools/read/SimpleRecord.java |   1 +
 .../tools/read/TestJsonRecordFormatter.java     | 231 +++++++++++++++++++
 4 files changed, 372 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da69d4b7/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
index b988eca..59af508 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/CatCommand.java
@@ -24,12 +24,16 @@ import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 
+import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.ParquetReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.tools.Main;
 import org.apache.parquet.tools.read.SimpleReadSupport;
 import org.apache.parquet.tools.read.SimpleRecord;
+import org.apache.parquet.tools.json.JsonRecordFormatter;
 
 public class CatCommand extends ArgsOnlyCommand {
   public static final String[] USAGE = new String[] {
@@ -71,9 +75,12 @@ public class CatCommand extends ArgsOnlyCommand {
     try {
       PrintWriter writer = new PrintWriter(Main.out, true);
       reader = ParquetReader.builder(new SimpleReadSupport(), new Path(input)).build();
+      ParquetMetadata metadata = ParquetFileReader.readFooter(new Configuration(), new Path(input));
+      JsonRecordFormatter.JsonGroupFormatter formatter = JsonRecordFormatter.fromSchema(metadata.getFileMetaData().getSchema());
+
       for (SimpleRecord value = reader.read(); value != null; value = reader.read()) {
         if (options.hasOption('j')) {
-          value.prettyPrintJson(writer);
+          writer.write(formatter.formatRecord(value));
         } else {
           value.prettyPrint(writer);
         }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da69d4b7/parquet-tools/src/main/java/org/apache/parquet/tools/json/JsonRecordFormatter.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/json/JsonRecordFormatter.java b/parquet-tools/src/main/java/org/apache/parquet/tools/json/JsonRecordFormatter.java
new file mode 100644
index 0000000..89a508a
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/json/JsonRecordFormatter.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.tools.json;
+
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.tools.read.SimpleRecord;
+import org.codehaus.jackson.map.ObjectMapper;
+
+import java.io.IOException;
+import java.util.*;
+
+public abstract class JsonRecordFormatter<T> {
+  private static final int SINGLE_VALUE = 0;
+
+  public static class JsonPrimitiveWriter extends JsonRecordFormatter<Object> {
+
+    public JsonPrimitiveWriter(Type primitiveType) {
+      super(primitiveType);
+    }
+
+    @Override
+    protected Object formatResults(List<Object> listOfValues) {
+      if (super.typeInfo.getRepetition() == Type.Repetition.REPEATED) {
+        return listOfValues;
+      } else {
+        return listOfValues.get(SINGLE_VALUE);
+      }
+    }
+  }
+
+  public static class JsonGroupFormatter extends JsonRecordFormatter<SimpleRecord> {
+    private final Map<String, JsonRecordFormatter> formatters;
+
+    public JsonGroupFormatter(GroupType schema) {
+      super(schema);
+
+      this.formatters = buildWriters(schema);
+    }
+
+    private Map<String, JsonRecordFormatter> buildWriters(GroupType groupSchema) {
+      Map<String, JsonRecordFormatter> writers = new LinkedHashMap<String, JsonRecordFormatter>();
+      for (Type type : groupSchema.getFields()) {
+        if (type.isPrimitive()) {
+          writers.put(type.getName(), new JsonPrimitiveWriter(type));
+        } else {
+          writers.put(type.getName(), new JsonGroupFormatter((GroupType) type));
+        }
+      }
+
+      return writers;
+    }
+
+    private Object add(SimpleRecord record) {
+      return formatEntries(collateEntries(record));
+    }
+
+    private Map<String, List<Object>> collateEntries(SimpleRecord record) {
+      Map<String, List<Object>> collatedEntries = new LinkedHashMap<String, List<Object>>();
+      for (SimpleRecord.NameValue value : record.getValues()) {
+        if (collatedEntries.containsKey(value.getName())) {
+          collatedEntries.get(value.getName()).add(value.getValue());
+        } else {
+          List<Object> newResultListForKey = new ArrayList<Object>();
+          newResultListForKey.add(value.getValue());
+          collatedEntries.put(value.getName(), newResultListForKey);
+        }
+      }
+
+      return collatedEntries;
+    }
+
+    private Object formatEntries(Map<String, List<Object>> entries) {
+      Map<String, Object> results = new LinkedHashMap<String, Object>();
+      for (Map.Entry<String, List<Object>> entry : entries.entrySet()) {
+        JsonRecordFormatter formatter = formatters.get(entry.getKey());
+        results.put(entry.getKey(), formatter.formatResults(entry.getValue()));
+      }
+
+      return results;
+    }
+
+    @Override
+    protected Object formatResults(List<SimpleRecord> values) {
+      if (super.typeInfo.getRepetition() == Type.Repetition.REPEATED) {
+        List<Object> results = new ArrayList<Object>();
+        for (SimpleRecord object : values) {
+          results.add(add(object));
+        }
+
+        return results;
+      } else {
+        return add(values.get(SINGLE_VALUE));
+      }
+    }
+
+    public String formatRecord(SimpleRecord value) throws IOException {
+      ObjectMapper mapper = new ObjectMapper();
+      return mapper.writeValueAsString(add(value));
+    }
+  }
+
+  protected final Type typeInfo;
+
+  protected JsonRecordFormatter(Type type) {
+    this.typeInfo = type;
+  }
+
+  protected abstract Object formatResults(List<T> values);
+
+  public static JsonGroupFormatter fromSchema(MessageType messageType) {
+    return new JsonGroupFormatter(messageType);
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da69d4b7/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
index 5f97c88..39c1ce0 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecord.java
@@ -111,6 +111,7 @@ public class SimpleRecord {
     for (NameValue value : values) {
       result.put(value.getName(), toJsonValue(value.getValue()));
     }
+
     return result;
   }
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da69d4b7/parquet-tools/src/test/java/org/apache/parquet/tools/read/TestJsonRecordFormatter.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/test/java/org/apache/parquet/tools/read/TestJsonRecordFormatter.java b/parquet-tools/src/test/java/org/apache/parquet/tools/read/TestJsonRecordFormatter.java
new file mode 100644
index 0000000..17b8ef2
--- /dev/null
+++ b/parquet-tools/src/test/java/org/apache/parquet/tools/read/TestJsonRecordFormatter.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.tools.read;
+
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.tools.json.JsonRecordFormatter;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestJsonRecordFormatter {
+  private <T> List<T> array(T... objects) {
+    return Arrays.asList(objects);
+  }
+
+  private <T> Map.Entry<String, T> entry(final String key, final T value) {
+    return new Map.Entry<String, T>() {
+      @Override
+      public String getKey() {
+        return key;
+      }
+
+      @Override
+      public T getValue() {
+        return value;
+      }
+
+      @Override
+      public T setValue(T value) {
+        throw new UnsupportedOperationException();
+      }
+    };
+  }
+
+  private Map<String, ?> obj(Map.Entry<String, ?>... entries) throws IOException {
+    Map<String, Object> entriesAsMap = new LinkedHashMap<String, Object>();
+    for (Map.Entry<String, ?> entry : entries) {
+      entriesAsMap.put(entry.getKey(), entry.getValue());
+    }
+
+    return entriesAsMap;
+  }
+
+  private SimpleRecord.NameValue kv(String name, Object value) {
+    return new SimpleRecord.NameValue(name, value);
+  }
+
+  private String asJsonString(Object object) throws IOException {
+    ObjectMapper mapper = new ObjectMapper();
+    return mapper.writeValueAsString(object);
+  }
+
+  @Test
+  public void testFlatSchemaWithArrays() throws Exception {
+    SimpleRecord simple = new SimpleRecord();
+    MessageType schema = new MessageType("schema",
+      new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "reqd"),
+      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, "opt"),
+      new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT32, "odd"),
+      new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT64, "even")
+    );
+
+    simple.values.add(kv("reqd", "a required value"));
+    simple.values.add(kv("opt", 1.2345));
+
+    simple.values.add(kv("odd", 1));
+    simple.values.add(kv("odd", 3));
+    simple.values.add(kv("odd", 5));
+    simple.values.add(kv("odd", 7));
+    simple.values.add(kv("odd", 9));
+
+    simple.values.add(kv("even", 2));
+    simple.values.add(kv("even", 4));
+    simple.values.add(kv("even", 6));
+    simple.values.add(kv("even", 8));
+    simple.values.add(kv("even", 10));
+
+    String expected = asJsonString(
+      obj(
+        entry("reqd", "a required value"),
+        entry("opt", 1.2345),
+        entry("odd", array(1, 3, 5, 7, 9)),
+        entry("even", array(2, 4, 6, 8, 10))
+      )
+    );
+
+    String actual = JsonRecordFormatter
+      .fromSchema(schema)
+      .formatRecord(simple);
+
+    assertEquals(expected, actual);
+  }
+
+  @Test
+  public void testNestedGrouping() throws Exception {
+    SimpleRecord simple = new SimpleRecord();
+    MessageType schema = new MessageType("schema",
+      new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "flat-string"),
+      new GroupType(Type.Repetition.OPTIONAL, "subgroup",
+        new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, "flat-int"),
+        new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "string-list")
+      )
+    );
+
+    SimpleRecord subgroup = new SimpleRecord();
+    subgroup.values.add(kv("flat-int", 12345));
+    subgroup.values.add(kv("string-list", "two"));
+    subgroup.values.add(kv("string-list", "four"));
+    subgroup.values.add(kv("string-list", "six"));
+    subgroup.values.add(kv("string-list", "eight"));
+    subgroup.values.add(kv("string-list", "ten"));
+
+    simple.values.add(kv("flat-string", "one"));
+    simple.values.add(kv("flat-string", "two"));
+    simple.values.add(kv("flat-string", "three"));
+    simple.values.add(kv("flat-string", "four"));
+    simple.values.add(kv("flat-string", "five"));
+
+    simple.values.add(kv("subgroup", subgroup));
+
+    String actual = JsonRecordFormatter
+      .fromSchema(schema)
+      .formatRecord(simple);
+
+    String expected = asJsonString(
+      obj(
+        entry("flat-string", array("one", "two", "three", "four", "five")),
+        entry("subgroup",
+          obj(
+            entry("flat-int", 12345),
+            entry("string-list", array("two", "four", "six", "eight", "ten"))
+          )
+        )
+      )
+    );
+
+    assertEquals(expected, actual);
+  }
+
+  @Test
+  public void testGroupList() throws Exception {
+    SimpleRecord simple = new SimpleRecord();
+    MessageType schema = new MessageType("schema",
+      new GroupType(Type.Repetition.REPEATED, "repeat-group",
+        new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT64, "flat-int"),
+        new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.DOUBLE, "repeat-double")
+      )
+    );
+
+    SimpleRecord repeatGroup = new SimpleRecord();
+    repeatGroup.values.add(kv("flat-int", 76543));
+    repeatGroup.values.add(kv("repeat-double", 1.2345));
+    repeatGroup.values.add(kv("repeat-double", 5.6789));
+    repeatGroup.values.add(kv("repeat-double", 10.11121314));
+    repeatGroup.values.add(kv("repeat-double", 0.4321));
+    repeatGroup.values.add(kv("repeat-double", 7.6543));
+    simple.values.add(kv("repeat-group", repeatGroup));
+
+    repeatGroup = new SimpleRecord();
+    repeatGroup.values.add(kv("flat-int", 12345));
+    repeatGroup.values.add(kv("repeat-double", 1.1));
+    repeatGroup.values.add(kv("repeat-double", 1.2));
+    repeatGroup.values.add(kv("repeat-double", 1.3));
+    repeatGroup.values.add(kv("repeat-double", 1.4));
+    repeatGroup.values.add(kv("repeat-double", 1.5));
+    simple.values.add(kv("repeat-group", repeatGroup));
+
+    repeatGroup = new SimpleRecord();
+    repeatGroup.values.add(kv("flat-int", 10293));
+    repeatGroup.values.add(kv("repeat-double", 9.5));
+    repeatGroup.values.add(kv("repeat-double", 9.4));
+    repeatGroup.values.add(kv("repeat-double", 9.3));
+    repeatGroup.values.add(kv("repeat-double", 9.2));
+    repeatGroup.values.add(kv("repeat-double", 9.1));
+    simple.values.add(kv("repeat-group", repeatGroup));
+
+    String actual = JsonRecordFormatter
+      .fromSchema(schema)
+      .formatRecord(simple);
+
+    String expected = asJsonString(
+      obj(
+        entry("repeat-group",
+          array(
+            obj(
+              entry("flat-int", 76543),
+              entry("repeat-double", array(1.2345, 5.6789, 10.11121314, 0.4321, 7.6543))
+            ),
+            obj(
+              entry("flat-int", 12345),
+              entry("repeat-double", array(1.1, 1.2, 1.3, 1.4, 1.5))
+            ),
+            obj(
+              entry("flat-int", 10293),
+              entry("repeat-double", array(9.5, 9.4, 9.3, 9.2, 9.1))
+            )
+          )
+        )
+      )
+    );
+
+    assertEquals(expected, actual);
+  }
+}


Mime
View raw message