parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject parquet-mr git commit: PARQUET-548: Add EncodingStats.
Date Sat, 23 Apr 2016 00:39:58 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 8bcfe6c55 -> 3dd2210e7


PARQUET-548: Add EncodingStats.

This adds `EncodingStats`, which tracks the number of pages for each encoding, separated into
dictionary and data pages. It also adds convenience functions that are useful for dictionary
filtering, like `hasDictionaryEncodedPages` and `hasNonDictionaryEncodedPages`.

`EncodingStats` have a unit test in parquet-column and an integration test in parquet-hadoop
that writes a file and verifies the stats are present and correct when it is read.

This includes commits from #330 because it updates the dictionary filter. I'll rebase and
remove them once it is merged.

Author: Ryan Blue <blue@apache.org>

Closes #332 from rdblue/PARQUET-548-add-encoding-stats and squashes the following commits:

5f148e6 [Ryan Blue] PARQUET-548: Fixes for review comments.
dc332d3 [Ryan Blue] PARQUET-548: Add EncodingStats.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/3dd2210e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/3dd2210e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/3dd2210e

Branch: refs/heads/master
Commit: 3dd2210e79a8eb84378c370b32652f9a53f87a93
Parents: 8bcfe6c
Author: Ryan Blue <blue@apache.org>
Authored: Fri Apr 22 17:39:52 2016 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Fri Apr 22 17:39:52 2016 -0700

----------------------------------------------------------------------
 .../apache/parquet/column/EncodingStats.java    | 162 +++++++++++++++
 .../parquet/column/TestEncodingStats.java       | 202 +++++++++++++++++++
 .../dictionarylevel/DictionaryFilter.java       |   6 +
 .../converter/ParquetMetadataConverter.java     |  50 +++++
 .../hadoop/ColumnChunkPageWriteStore.java       |  25 ++-
 .../parquet/hadoop/DictionaryPageReader.java    |   7 +
 .../parquet/hadoop/ParquetFileWriter.java       |  31 ++-
 .../hadoop/metadata/ColumnChunkMetaData.java    |  67 +++---
 .../hadoop/TestReadWriteEncodingStats.java      | 121 +++++++++++
 9 files changed, 628 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-column/src/main/java/org/apache/parquet/column/EncodingStats.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/EncodingStats.java b/parquet-column/src/main/java/org/apache/parquet/column/EncodingStats.java
new file mode 100644
index 0000000..a8b95f8
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/EncodingStats.java
@@ -0,0 +1,162 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+
+package org.apache.parquet.column;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
+import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
+
+/**
+ * EncodingStats track dictionary and data page encodings for a single column within a row group.
+ * These are used when filtering row groups. For example, to filter a row group based on a column's
+ * dictionary, all of the data pages in that column must be dictionary-encoded. This class provides
+ * convenience methods for those checks, like {@link #hasNonDictionaryEncodedPages()}.
+ */
+public class EncodingStats {
+  final Map<Encoding, Integer> dictStats;
+  final Map<Encoding, Integer> dataStats;
+  private final boolean usesV2Pages;
+
+  private EncodingStats(Map<Encoding, Integer> dictStats,
+                        Map<Encoding, Integer> dataStats,
+                        boolean usesV2Pages) {
+    this.dictStats = dictStats;
+    this.dataStats = dataStats;
+    this.usesV2Pages = usesV2Pages;
+  }
+
+  public Set<Encoding> getDictionaryEncodings() {
+    return dictStats.keySet();
+  }
+
+  public Set<Encoding> getDataEncodings() {
+    return dataStats.keySet();
+  }
+
+  public int getNumDictionaryPagesEncodedAs(Encoding enc) {
+    if (dictStats.containsKey(enc)) {
+      return dictStats.get(enc);
+    } else {
+      return 0;
+    }
+  }
+
+  public int getNumDataPagesEncodedAs(Encoding enc) {
+    if (dataStats.containsKey(enc)) {
+      return dataStats.get(enc);
+    } else {
+      return 0;
+    }
+  }
+
+  public boolean hasDictionaryPages() {
+    return !dictStats.isEmpty();
+  }
+
+  public boolean hasDictionaryEncodedPages() {
+    Set<Encoding> encodings = dataStats.keySet();
+    return (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY));
+  }
+
+  public boolean hasNonDictionaryEncodedPages() {
+    if (dataStats.isEmpty()) {
+      return false; // no pages
+    }
+
+    // this modifies the set, so copy it
+    Set<Encoding> encodings = new HashSet<Encoding>(dataStats.keySet());
+    if (!encodings.remove(RLE_DICTIONARY) &&
+        !encodings.remove(PLAIN_DICTIONARY)) {
+      return true; // not dictionary encoded
+    }
+
+    if (encodings.isEmpty()) {
+      return false;
+    }
+
+    // at least one non-dictionary encoding is present
+    return true;
+  }
+
+  public boolean usesV2Pages() {
+    return usesV2Pages;
+  }
+
+  /**
+   * Used to build {@link EncodingStats} from metadata or to accumulate stats as pages are written.
+   */
+  public static class Builder {
+    private final Map<Encoding, Integer> dictStats = new LinkedHashMap<Encoding, Integer>();
+    private final Map<Encoding, Integer> dataStats = new LinkedHashMap<Encoding, Integer>();
+    private boolean usesV2Pages = false;
+
+    public Builder clear() {
+      this.usesV2Pages = false;
+      dictStats.clear();
+      dataStats.clear();
+      return this;
+    }
+
+    public Builder withV2Pages() {
+      this.usesV2Pages = true;
+      return this;
+    }
+
+    public Builder addDictEncoding(Encoding encoding) {
+      return addDictEncoding(encoding, 1);
+    }
+
+    public Builder addDictEncoding(Encoding encoding, int numPages) {
+      Integer pages = dictStats.get(encoding);
+      dictStats.put(encoding, numPages + (pages != null ? pages : 0));
+      return this;
+    }
+
+    public Builder addDataEncodings(Collection<Encoding> encodings) {
+      for (Encoding encoding : encodings) {
+        addDataEncoding(encoding);
+      }
+      return this;
+    }
+
+    public Builder addDataEncoding(Encoding encoding) {
+      return addDataEncoding(encoding, 1);
+    }
+
+    public Builder addDataEncoding(Encoding encoding, int numPages) {
+      Integer pages = dataStats.get(encoding);
+      dataStats.put(encoding, numPages + (pages != null ? pages : 0));
+      return this;
+    }
+
+    public EncodingStats build() {
+      return new EncodingStats(
+          Collections.unmodifiableMap(new LinkedHashMap<Encoding, Integer>(dictStats)),
+          Collections.unmodifiableMap(new LinkedHashMap<Encoding, Integer>(dataStats)),
+          usesV2Pages);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-column/src/test/java/org/apache/parquet/column/TestEncodingStats.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/TestEncodingStats.java b/parquet-column/src/test/java/org/apache/parquet/column/TestEncodingStats.java
new file mode 100644
index 0000000..4c46688
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/column/TestEncodingStats.java
@@ -0,0 +1,202 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+
+package org.apache.parquet.column;
+
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class TestEncodingStats {
+  @Test
+  public void testReusedBuilder() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.withV2Pages();
+    builder.addDictEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.RLE_DICTIONARY, 3);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    EncodingStats stats1 = builder.build();
+
+    Map<Encoding, Integer> expectedDictStats1 = new HashMap<Encoding, Integer>();
+    expectedDictStats1.put(Encoding.PLAIN, 1);
+    Map<Encoding, Integer> expectedDataStats1 = new HashMap<Encoding, Integer>();
+    expectedDataStats1.put(Encoding.RLE_DICTIONARY, 3);
+    expectedDataStats1.put(Encoding.DELTA_BYTE_ARRAY, 2);
+
+    builder.clear();
+    builder.addDataEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.PLAIN);
+    EncodingStats stats2 = builder.build();
+
+    Map<Encoding, Integer> expectedDictStats2 = new HashMap<Encoding, Integer>();
+    Map<Encoding, Integer> expectedDataStats2 = new HashMap<Encoding, Integer>();
+    expectedDataStats2.put(Encoding.PLAIN, 4);
+
+    assertEquals("Dictionary stats should be correct", expectedDictStats2, stats2.dictStats);
+    assertEquals("Data stats should be correct", expectedDataStats2, stats2.dataStats);
+
+    assertEquals("Dictionary stats should be correct after reuse", expectedDictStats1, stats1.dictStats);
+    assertEquals("Data stats should be correct after reuse", expectedDataStats1, stats1.dataStats);
+  }
+
+  @Test
+  public void testNoPages() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    EncodingStats stats = builder.build();
+
+    assertFalse(stats.usesV2Pages());
+    assertFalse("Should not have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertFalse("Should not have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertFalse("Should not have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testNoDataPages() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.addDictEncoding(Encoding.PLAIN_DICTIONARY);
+    EncodingStats stats = builder.build();
+
+    assertFalse(stats.usesV2Pages());
+    assertFalse("Should not have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertFalse("Should not have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertTrue("Should have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV1AllDictionary() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.addDictEncoding(Encoding.PLAIN_DICTIONARY);
+    builder.addDataEncoding(Encoding.PLAIN_DICTIONARY);
+    builder.addDataEncoding(Encoding.PLAIN_DICTIONARY);
+    EncodingStats stats = builder.build();
+
+    assertFalse(stats.usesV2Pages());
+    assertTrue("Should have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertFalse("Should not have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertTrue("Should have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV1NoDictionary() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.addDataEncoding(Encoding.PLAIN);
+    EncodingStats stats = builder.build();
+
+    assertFalse(stats.usesV2Pages());
+    assertFalse("Should not have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertTrue("Should have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertFalse("Should not have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV1Fallback() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.addDictEncoding(Encoding.PLAIN_DICTIONARY);
+    builder.addDataEncoding(Encoding.PLAIN_DICTIONARY);
+    builder.addDataEncoding(Encoding.PLAIN_DICTIONARY);
+    builder.addDataEncoding(Encoding.PLAIN);
+    EncodingStats stats = builder.build();
+
+    assertFalse(stats.usesV2Pages());
+    assertTrue("Should have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertTrue("Should have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertTrue("Should have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV2AllDictionary() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.withV2Pages();
+    builder.addDictEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.RLE_DICTIONARY);
+    EncodingStats stats = builder.build();
+
+    assertTrue(stats.usesV2Pages());
+    assertTrue("Should have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertFalse("Should not have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertTrue("Should have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV2NoDictionary() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.withV2Pages();
+    builder.addDataEncoding(Encoding.DELTA_BINARY_PACKED);
+    builder.addDataEncoding(Encoding.DELTA_BINARY_PACKED);
+    EncodingStats stats = builder.build();
+
+    assertTrue(stats.usesV2Pages());
+    assertFalse("Should not have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertTrue("Should have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertFalse("Should not have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testV2Fallback() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.withV2Pages();
+    builder.addDictEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.RLE_DICTIONARY);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    EncodingStats stats = builder.build();
+
+    assertTrue(stats.usesV2Pages());
+    assertTrue("Should have dictionary-encoded pages", stats.hasDictionaryEncodedPages());
+    assertTrue("Should have non-dictionary pages", stats.hasNonDictionaryEncodedPages());
+    assertTrue("Should have dictionary pages", stats.hasDictionaryPages());
+  }
+
+  @Test
+  public void testCounts() {
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    builder.withV2Pages();
+    builder.addDictEncoding(Encoding.PLAIN);
+    builder.addDataEncoding(Encoding.RLE_DICTIONARY, 4);
+    builder.addDataEncoding(Encoding.RLE_DICTIONARY);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    builder.addDataEncoding(Encoding.DELTA_BYTE_ARRAY);
+    EncodingStats stats = builder.build();
+
+    assertEquals("Count should match", 1, stats.getNumDictionaryPagesEncodedAs(Encoding.PLAIN));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.PLAIN_DICTIONARY));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.RLE));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.BIT_PACKED));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.DELTA_BYTE_ARRAY));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.DELTA_BINARY_PACKED));
+    assertEquals("Count should match", 0, stats.getNumDictionaryPagesEncodedAs(Encoding.DELTA_LENGTH_BYTE_ARRAY));
+
+    assertEquals("Count should match", 5, stats.getNumDataPagesEncodedAs(Encoding.RLE_DICTIONARY));
+    assertEquals("Count should match", 2, stats.getNumDataPagesEncodedAs(Encoding.DELTA_BYTE_ARRAY));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.RLE));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.BIT_PACKED));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.PLAIN));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.PLAIN_DICTIONARY));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.DELTA_BINARY_PACKED));
+    assertEquals("Count should match", 0, stats.getNumDataPagesEncodedAs(Encoding.DELTA_LENGTH_BYTE_ARRAY));
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index 6235c20..9b03f82 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -22,6 +22,7 @@ import org.apache.parquet.Log;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.column.page.DictionaryPage;
 import org.apache.parquet.column.page.DictionaryPageReadStore;
 import org.apache.parquet.filter2.predicate.FilterPredicate;
@@ -329,6 +330,11 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> {
 
   @SuppressWarnings("deprecation")
   private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
+    EncodingStats stats = meta.getEncodingStats();
+    if (stats != null) {
+      return stats.hasNonDictionaryEncodedPages();
+    }
+
     // without EncodingStats, fall back to testing the encoding list
     Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
     if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 48f295e..6feb4a2 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -38,6 +38,7 @@ import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.parquet.CorruptStatistics;
 import org.apache.parquet.Log;
+import org.apache.parquet.format.PageEncodingStats;
 import org.apache.parquet.hadoop.metadata.ColumnPath;
 import org.apache.parquet.format.ColumnChunk;
 import org.apache.parquet.format.ColumnMetaData;
@@ -58,6 +59,7 @@ import org.apache.parquet.format.Type;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.io.ParquetDecodingException;
 import org.apache.parquet.schema.GroupType;
@@ -183,6 +185,9 @@ public class ParquetMetadataConverter {
       if (!columnMetaData.getStatistics().isEmpty()) {
         columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
       }
+      if (columnMetaData.getEncodingStats() != null) {
+        columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
+      }
 //      columnChunk.meta_data.index_page_offset = ;
 //      columnChunk.meta_data.key_value_metadata = ; // nothing yet
 
@@ -232,6 +237,50 @@ public class ParquetMetadataConverter {
     return Encoding.valueOf(encoding.name());
   }
 
+  public EncodingStats convertEncodingStats(List<PageEncodingStats> stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    EncodingStats.Builder builder = new EncodingStats.Builder();
+    for (PageEncodingStats stat : stats) {
+      switch (stat.getPage_type()) {
+        case DATA_PAGE_V2:
+          builder.withV2Pages();
+          // falls through
+        case DATA_PAGE:
+          builder.addDataEncoding(
+              getEncoding(stat.getEncoding()), stat.getCount());
+          break;
+        case DICTIONARY_PAGE:
+          builder.addDictEncoding(
+              getEncoding(stat.getEncoding()), stat.getCount());
+          break;
+      }
+    }
+    return builder.build();
+  }
+
+  public List<PageEncodingStats> convertEncodingStats(EncodingStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    List<PageEncodingStats> formatStats = new ArrayList<PageEncodingStats>();
+    for (org.apache.parquet.column.Encoding encoding : stats.getDictionaryEncodings()) {
+      formatStats.add(new PageEncodingStats(
+          PageType.DICTIONARY_PAGE, getEncoding(encoding),
+          stats.getNumDictionaryPagesEncodedAs(encoding)));
+    }
+    PageType dataPageType = (stats.usesV2Pages() ? PageType.DATA_PAGE_V2 : PageType.DATA_PAGE);
+    for (org.apache.parquet.column.Encoding encoding : stats.getDataEncodings()) {
+      formatStats.add(new PageEncodingStats(
+          dataPageType, getEncoding(encoding),
+          stats.getNumDataPagesEncodedAs(encoding)));
+    }
+    return formatStats;
+  }
+
   public static Statistics toParquetStatistics(
       org.apache.parquet.column.statistics.Statistics statistics) {
     Statistics stats = new Statistics();
@@ -613,6 +662,7 @@ public class ParquetMetadataConverter {
               path,
               messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName(),
               CompressionCodecName.fromParquet(metaData.codec),
+              convertEncodingStats(metaData.getEncoding_stats()),
               fromFormatEncodings(metaData.encodings),
               fromParquetStatistics(
                   parquetMetadata.getCreated_by(),

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
index 2eab54a..0fb9a18 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
@@ -26,6 +26,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -63,7 +64,10 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
     private long totalValueCount;
     private int pageCount;
 
-    private Set<Encoding> encodings = new HashSet<Encoding>();
+    // repetition and definition level encodings are used only for v1 pages and don't change
+    private Set<Encoding> rlEncodings = new HashSet<Encoding>();
+    private Set<Encoding> dlEncodings = new HashSet<Encoding>();
+    private List<Encoding> dataEncodings = new ArrayList<Encoding>();
 
     private Statistics totalStatistics;
     private final ByteBufferAllocator allocator;
@@ -116,9 +120,9 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
       // by concatenating before collecting instead of collecting twice,
       // we only allocate one buffer to copy into instead of multiple.
       buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes));
-      encodings.add(rlEncoding);
-      encodings.add(dlEncoding);
-      encodings.add(valuesEncoding);
+      rlEncodings.add(rlEncoding);
+      dlEncodings.add(dlEncoding);
+      dataEncodings.add(valuesEncoding);
     }
 
     @Override
@@ -161,7 +165,7 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
               definitionLevels,
               compressedData)
       );
-      encodings.add(dataEncoding);
+      dataEncodings.add(dataEncoding);
     }
 
     private int toIntWithCheck(long size) {
@@ -182,21 +186,24 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
       writer.startColumn(path, totalValueCount, compressor.getCodecName());
       if (dictionaryPage != null) {
         writer.writeDictionaryPage(dictionaryPage);
-        encodings.add(dictionaryPage.getEncoding());
+        // tracking the dictionary encoding is handled in writeDictionaryPage
       }
-      writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
+      writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics,
+          rlEncodings, dlEncodings, dataEncodings);
       writer.endColumn();
       if (INFO) {
         LOG.info(
             String.format(
                 "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
-                buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
+                buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, new HashSet<Encoding>(dataEncodings))
                 + (dictionaryPage != null ? String.format(
                 ", dic { %,d entries, %,dB raw, %,dB comp}",
                 dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getDictionarySize())
                 : ""));
       }
-      encodings.clear();
+      rlEncodings.clear();
+      dlEncodings.clear();
+      dataEncodings.clear();
       pageCount = 0;
     }
 

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DictionaryPageReader.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DictionaryPageReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DictionaryPageReader.java
index cb0d5e7..9a99358 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DictionaryPageReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DictionaryPageReader.java
@@ -21,6 +21,7 @@ package org.apache.parquet.hadoop;
 import org.apache.parquet.Strings;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.column.page.DictionaryPage;
 import org.apache.parquet.column.page.DictionaryPageReadStore;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
@@ -104,6 +105,12 @@ class DictionaryPageReader implements DictionaryPageReadStore {
   }
 
   private boolean hasDictionaryPage(ColumnChunkMetaData column) {
+    EncodingStats stats = column.getEncodingStats();
+    if (stats != null) {
+      // ensure there is a dictionary page and that it is used to encode data pages
+      return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
+    }
+
     Set<Encoding> encodings = column.getEncodings();
     return (encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY));
   }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index 442d3f2..523d01f 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -49,6 +49,7 @@ import org.apache.parquet.bytes.BytesInput;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.column.page.DictionaryPage;
 import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
@@ -118,6 +119,7 @@ public class ParquetFileWriter {
   private long currentRecordCount; // set in startBlock
 
   // column chunk data accumulated as pages are written
+  private EncodingStats.Builder encodingStatsBuilder;
   private Set<Encoding> currentEncodings;
   private long uncompressedLength;
   private long compressedLength;
@@ -239,6 +241,8 @@ public class ParquetFileWriter {
       this.alignment = NoAlignment.get(rowGroupSize);
       this.out = fs.create(file, overwriteFlag);
     }
+
+    this.encodingStatsBuilder = new EncodingStats.Builder();
   }
 
   /**
@@ -259,6 +263,7 @@ public class ParquetFileWriter {
         rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
     this.out = fs.create(file, true, DFS_BUFFER_SIZE_DEFAULT,
         fs.getDefaultReplication(file), rowAndBlockSize);
+    this.encodingStatsBuilder = new EncodingStats.Builder();
   }
 
   /**
@@ -298,6 +303,7 @@ public class ParquetFileWriter {
                           long valueCount,
                           CompressionCodecName compressionCodecName) throws IOException {
     state = state.startColumn();
+    encodingStatsBuilder.clear();
     currentEncodings = new HashSet<Encoding>();
     currentChunkPath = ColumnPath.get(descriptor.getPath());
     currentChunkType = descriptor.getType();
@@ -332,6 +338,7 @@ public class ParquetFileWriter {
     this.compressedLength += compressedPageSize + headerSize;
     if (DEBUG) LOG.debug(out.getPos() + ": write dictionary page content " + compressedPageSize);
     dictionaryPage.getBytes().writeAllTo(out);
+    encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
     currentEncodings.add(dictionaryPage.getEncoding());
   }
 
@@ -368,6 +375,7 @@ public class ParquetFileWriter {
     this.compressedLength += compressedPageSize + headerSize;
     if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
     bytes.writeAllTo(out);
+    encodingStatsBuilder.addDataEncoding(valuesEncoding);
     currentEncodings.add(rlEncoding);
     currentEncodings.add(dlEncoding);
     currentEncodings.add(valuesEncoding);
@@ -407,6 +415,7 @@ public class ParquetFileWriter {
     if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
     bytes.writeAllTo(out);
     currentStatistics.mergeStatistics(statistics);
+    encodingStatsBuilder.addDataEncoding(valuesEncoding);
     currentEncodings.add(rlEncoding);
     currentEncodings.add(dlEncoding);
     currentEncodings.add(valuesEncoding);
@@ -419,11 +428,13 @@ public class ParquetFileWriter {
    * @param compressedTotalPageSize total compressed size (without page headers)
    * @throws IOException
    */
-   void writeDataPages(BytesInput bytes,
-                       long uncompressedTotalPageSize,
-                       long compressedTotalPageSize,
-                       Statistics totalStats,
-                       List<Encoding> encodings) throws IOException {
+  void writeDataPages(BytesInput bytes,
+                      long uncompressedTotalPageSize,
+                      long compressedTotalPageSize,
+                      Statistics totalStats,
+                      Set<Encoding> rlEncodings,
+                      Set<Encoding> dlEncodings,
+                      List<Encoding> dataEncodings) throws IOException {
     state = state.write();
     if (DEBUG) LOG.debug(out.getPos() + ": write data pages");
     long headersSize = bytes.size() - compressedTotalPageSize;
@@ -431,7 +442,13 @@ public class ParquetFileWriter {
     this.compressedLength += compressedTotalPageSize + headersSize;
     if (DEBUG) LOG.debug(out.getPos() + ": write data pages content");
     bytes.writeAllTo(out);
-    currentEncodings.addAll(encodings);
+    encodingStatsBuilder.addDataEncodings(dataEncodings);
+    if (rlEncodings.isEmpty()) {
+      encodingStatsBuilder.withV2Pages();
+    }
+    currentEncodings.addAll(rlEncodings);
+    currentEncodings.addAll(dlEncodings);
+    currentEncodings.addAll(dataEncodings);
     currentStatistics = totalStats;
   }
 
@@ -446,6 +463,7 @@ public class ParquetFileWriter {
         currentChunkPath,
         currentChunkType,
         currentChunkCodec,
+        encodingStatsBuilder.build(),
         currentEncodings,
         currentStatistics,
         currentChunkFirstDataPage,
@@ -543,6 +561,7 @@ public class ParquetFileWriter {
           chunk.getPath(),
           chunk.getType(),
           chunk.getCodec(),
+          chunk.getEncodingStats(),
           chunk.getEncodings(),
           chunk.getStatistics(),
           newChunkStart,

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java
index 0c2fd4d..720bd77 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java
@@ -21,6 +21,7 @@ package org.apache.parquet.hadoop.metadata;
 import java.util.Set;
 
 import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.column.statistics.BooleanStatistics;
 import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
@@ -42,37 +43,33 @@ abstract public class ColumnChunkMetaData {
       long valueCount,
       long totalSize,
       long totalUncompressedSize) {
-    // to save space we store those always positive longs in ints when they fit.
-    if (positiveLongFitsInAnInt(firstDataPage)
-        && positiveLongFitsInAnInt(dictionaryPageOffset)
-        && positiveLongFitsInAnInt(valueCount)
-        && positiveLongFitsInAnInt(totalSize)
-        && positiveLongFitsInAnInt(totalUncompressedSize)) {
-      return new IntColumnChunkMetaData(
-          path, type, codec, encodings,
-          new BooleanStatistics(),
-          firstDataPage,
-          dictionaryPageOffset,
-          valueCount,
-          totalSize,
-          totalUncompressedSize);
-    } else {
-      return new LongColumnChunkMetaData(
-          path, type, codec, encodings,
-          new BooleanStatistics(),
-          firstDataPage,
-          dictionaryPageOffset,
-          valueCount,
-          totalSize,
-          totalUncompressedSize);
-    }
+    return get(
+        path, type, codec, null, encodings, new BooleanStatistics(), firstDataPage,
+        dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize);
   }
 
+  @Deprecated
+  public static ColumnChunkMetaData get(
+      ColumnPath path,
+      PrimitiveTypeName type,
+      CompressionCodecName codec,
+      Set<Encoding> encodings,
+      Statistics statistics,
+      long firstDataPage,
+      long dictionaryPageOffset,
+      long valueCount,
+      long totalSize,
+      long totalUncompressedSize) {
+    return get(
+        path, type, codec, null, encodings, statistics, firstDataPage, dictionaryPageOffset,
+        valueCount, totalSize, totalUncompressedSize);
+  }
 
   public static ColumnChunkMetaData get(
       ColumnPath path,
       PrimitiveTypeName type,
       CompressionCodecName codec,
+      EncodingStats encodingStats,
       Set<Encoding> encodings,
       Statistics statistics,
       long firstDataPage,
@@ -87,7 +84,8 @@ abstract public class ColumnChunkMetaData {
         && positiveLongFitsInAnInt(totalSize)
         && positiveLongFitsInAnInt(totalUncompressedSize)) {
       return new IntColumnChunkMetaData(
-          path, type, codec, encodings,
+          path, type, codec,
+          encodingStats, encodings,
           statistics,
           firstDataPage,
           dictionaryPageOffset,
@@ -96,7 +94,8 @@ abstract public class ColumnChunkMetaData {
           totalUncompressedSize);
     } else {
       return new LongColumnChunkMetaData(
-          path, type, codec, encodings,
+          path, type, codec,
+          encodingStats, encodings,
           statistics,
           firstDataPage,
           dictionaryPageOffset,
@@ -129,10 +128,17 @@ abstract public class ColumnChunkMetaData {
     return (value >= 0) && (value + Integer.MIN_VALUE <= Integer.MAX_VALUE);
   }
 
+  private final EncodingStats encodingStats;
+
   // we save 3 references by storing together the column properties that have few distinct values
   private final ColumnChunkProperties properties;
 
   protected ColumnChunkMetaData(ColumnChunkProperties columnChunkProperties) {
+    this(null, columnChunkProperties);
+  }
+
+  protected ColumnChunkMetaData(EncodingStats encodingStats, ColumnChunkProperties columnChunkProperties) {
+    this.encodingStats = encodingStats;
     this.properties = columnChunkProperties;
   }
 
@@ -192,6 +198,9 @@ abstract public class ColumnChunkMetaData {
     return properties.getEncodings();
   }
 
+  public EncodingStats getEncodingStats() {
+    return encodingStats;
+  }
 
   @Override
   public String toString() {
@@ -224,6 +233,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData {
       ColumnPath path,
       PrimitiveTypeName type,
       CompressionCodecName codec,
+      EncodingStats encodingStats,
       Set<Encoding> encodings,
       Statistics statistics,
       long firstDataPage,
@@ -231,7 +241,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData {
       long valueCount,
       long totalSize,
       long totalUncompressedSize) {
-    super(ColumnChunkProperties.get(path, type, codec, encodings));
+    super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
     this.firstDataPage = positiveLongToInt(firstDataPage);
     this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset);
     this.valueCount = positiveLongToInt(valueCount);
@@ -328,6 +338,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData {
       ColumnPath path,
       PrimitiveTypeName type,
       CompressionCodecName codec,
+      EncodingStats encodingStats,
       Set<Encoding> encodings,
       Statistics statistics,
       long firstDataPageOffset,
@@ -335,7 +346,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData {
       long valueCount,
       long totalSize,
       long totalUncompressedSize) {
-    super(ColumnChunkProperties.get(path, type, codec, encodings));
+    super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings));
     this.firstDataPageOffset = firstDataPageOffset;
     this.dictionaryPageOffset = dictionaryPageOffset;
     this.valueCount = valueCount;

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3dd2210e/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestReadWriteEncodingStats.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestReadWriteEncodingStats.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestReadWriteEncodingStats.java
new file mode 100644
index 0000000..69e11c1
--- /dev/null
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestReadWriteEncodingStats.java
@@ -0,0 +1,121 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+
+package org.apache.parquet.hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.column.EncodingStats;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.schema.MessageType;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.UUID;
+
+import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
+import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests that files are written with EncodingStats, the stats are readable, and generally correct.
+ */
+public class TestReadWriteEncodingStats {
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  private static final Configuration CONF = new Configuration();
+  private static final int NUM_RECORDS = 1000;
+  private static final MessageType SCHEMA = parseMessageType(
+      "message test { "
+          + "required binary dict_binary_field; "
+          + "required int32 plain_int32_field; "
+          + "required binary fallback_binary_field; "
+          + "} ");
+
+  private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
+
+  private static void writeData(ParquetWriter<Group> writer) throws IOException {
+    SimpleGroupFactory f = new SimpleGroupFactory(SCHEMA);
+    for (int i = 0; i < NUM_RECORDS; i += 1) {
+      int index = i % ALPHABET.length();
+
+      Group group = f.newGroup()
+          .append("dict_binary_field", ALPHABET.substring(index, index+1))
+          .append("plain_int32_field", i)
+          .append("fallback_binary_field", i < (NUM_RECORDS / 2) ?
+              ALPHABET.substring(index, index+1) : UUID.randomUUID().toString());
+
+      writer.write(group);
+    }
+  }
+  @Test
+  public void testReadWrite() throws Exception {
+    File file = temp.newFile("encoding-stats.parquet");
+    assertTrue(file.delete());
+    Path path = new Path(file.toString());
+
+    ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+        .withWriterVersion(PARQUET_1_0)
+        .withPageSize(1024) // ensure multiple pages are written
+        .enableDictionaryEncoding()
+        .withDictionaryPageSize(2*1024)
+        .withConf(CONF)
+        .withType(SCHEMA)
+        .build();
+    writeData(writer);
+    writer.close();
+
+    ParquetFileReader reader = ParquetFileReader.open(CONF, path);
+    assertEquals("Should have one row group", 1, reader.getRowGroups().size());
+    BlockMetaData rowGroup = reader.getRowGroups().get(0);
+
+    ColumnChunkMetaData dictColumn = rowGroup.getColumns().get(0);
+    EncodingStats dictStats = dictColumn.getEncodingStats();
+    assertNotNull("Dict column should have non-null encoding stats", dictStats);
+    assertTrue("Dict column should have a dict page", dictStats.hasDictionaryPages());
+    assertTrue("Dict column should have dict-encoded pages", dictStats.hasDictionaryEncodedPages());
+    assertFalse("Dict column should not have non-dict pages", dictStats.hasNonDictionaryEncodedPages());
+
+    ColumnChunkMetaData plainColumn = rowGroup.getColumns().get(1);
+    EncodingStats plainStats = plainColumn.getEncodingStats();
+    assertNotNull("Plain column should have non-null encoding stats", plainStats);
+    assertFalse("Plain column should not have a dict page", plainStats.hasDictionaryPages());
+    assertFalse("Plain column should not have dict-encoded pages", plainStats.hasDictionaryEncodedPages());
+    assertTrue("Plain column should have non-dict pages", plainStats.hasNonDictionaryEncodedPages());
+
+    ColumnChunkMetaData fallbackColumn = rowGroup.getColumns().get(2);
+    EncodingStats fallbackStats = fallbackColumn.getEncodingStats();
+    assertNotNull("Fallback column should have non-null encoding stats", fallbackStats);
+    assertTrue("Fallback column should have a dict page", fallbackStats.hasDictionaryPages());
+    assertTrue("Fallback column should have dict-encoded pages", fallbackStats.hasDictionaryEncodedPages());
+    assertTrue("Fallback column should have non-dict pages", fallbackStats.hasNonDictionaryEncodedPages());
+  }
+}


Mime
View raw message