parquet-commits mailing list archives

From alexleven...@apache.org
Subject parquet-mr git commit: PARQUET-284: Clean up ParquetMetadataConverter
Date Wed, 24 Jun 2015 23:02:35 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 46448e934 -> 5c2ba72f9


PARQUET-284: Clean up ParquetMetadataConverter

Makes all methods static and removes an unused, thread-unsafe cache.

It turns out the "cache" was only read from *after* rebuilding the value that needed to be cached, so
it provided no performance gain (and there is no loss from removing it).

However, I don't know whether this will fix the issue mentioned in PARQUET-284; I don't think concurrent
access to a HashMap causes deadlock, only undefined behavior on reads or perhaps a
ConcurrentModificationException.

UPDATE: I was wrong; concurrent access can cause an infinite loop, so this should fix the issue: https://gist.github.com/rednaxelafx/1081908

UPDATE 2: Put the cache back in, making it static and thread-safe.
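
For illustration, here is a minimal standalone sketch of the interning pattern this change relies on
(ConcurrentHashMap.putIfAbsent over immutable sets). The class EncodingSetInterner, the intern method,
and the use of String sets are hypothetical names chosen for the example; they are not the committed
code, which appears in the diff below.

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

public class EncodingSetInterner {
  // Maps each distinct set to its canonical (cached) instance. ConcurrentHashMap is safe for
  // concurrent readers and writers, unlike a plain HashMap, whose concurrent resize can corrupt
  // bucket chains and send a reader into an infinite loop.
  private static final ConcurrentHashMap<Set<String>, Set<String>> CACHE =
      new ConcurrentHashMap<Set<String>, Set<String>>();

  static Set<String> intern(Set<String> encodings) {
    // Wrap in an unmodifiable view and drop the mutable reference, so the cached instance
    // can be shared across threads without synchronization.
    Set<String> immutable = Collections.unmodifiableSet(new HashSet<String>(encodings));
    // putIfAbsent atomically inserts the value and returns the previously cached instance, if any.
    Set<String> cached = CACHE.putIfAbsent(immutable, immutable);
    return cached == null ? immutable : cached;
  }

  public static void main(String[] args) {
    Set<String> a = intern(new HashSet<String>(Arrays.asList("PLAIN", "RLE")));
    Set<String> b = intern(new HashSet<String>(Arrays.asList("RLE", "PLAIN")));
    // Equal sets are de-duplicated to the same instance, which is a memory saving, not a CPU saving.
    System.out.println(a == b); // true
  }
}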

Author: Alex Levenson <alexlevenson@twitter.com>

Closes #220 from isnotinvain/alexlevenson/PARQUET-284 and squashes the following commits:

4797b48 [Alex Levenson] Fix merge conflict issue
8ff5775 [Alex Levenson] Merge branch 'master' into alexlevenson/PARQUET-284
ccd4776 [Alex Levenson] add encoding cache back in
9ea5a5f [Alex Levenson] Clean up ParquetMetadataConverter: make all method static, remove
unused thread-unsafe cache


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/5c2ba72f
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/5c2ba72f
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/5c2ba72f

Branch: refs/heads/master
Commit: 5c2ba72f9b4897d4441eff34ff0591e74a1d94bb
Parents: 46448e9
Author: Alex Levenson <alexlevenson@twitter.com>
Authored: Wed Jun 24 16:02:30 2015 -0700
Committer: Alex Levenson <alexlevenson@twitter.com>
Committed: Wed Jun 24 16:02:30 2015 -0700

----------------------------------------------------------------------
 .../converter/ParquetMetadataConverter.java     | 128 ++++++++++---------
 .../hadoop/ColumnChunkPageWriteStore.java       |   6 +-
 .../parquet/hadoop/ParquetFileReader.java       |  16 +--
 .../parquet/hadoop/ParquetFileWriter.java       |  10 +-
 .../converter/TestParquetMetadataConverter.java |  67 +++++++---
 5 files changed, 130 insertions(+), 97 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/5c2ba72f/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 859ec21..1481783 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -34,6 +34,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.parquet.Log;
 import org.apache.parquet.hadoop.metadata.ColumnPath;
@@ -67,10 +68,25 @@ import org.apache.parquet.schema.Type.Repetition;
 import org.apache.parquet.schema.TypeVisitor;
 import org.apache.parquet.schema.Types;
 
+// TODO: This file has become too long!
+// TODO: Lets split it up: https://issues.apache.org/jira/browse/PARQUET-310
 public class ParquetMetadataConverter {
+  private ParquetMetadataConverter() { }
+
+  public static final MetadataFilter NO_FILTER = new NoFilter();
+  public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
+
   private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);
 
-  public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
+  // NOTE: this cache is for memory savings, not cpu savings, and is used to de-duplicate
+  // sets of encodings. It is important that all collections inserted to this cache be
+  // immutable and have thread-safe read-only access. This can be achieved by wrapping
+  // an unsynchronized collection in Collections.unmodifiable*(), and making sure to not
+  // keep any references to the original collection.
+  private static final ConcurrentHashMap<Set<org.apache.parquet.column.Encoding>, Set<org.apache.parquet.column.Encoding>>
+      cachedEncodingSets = new ConcurrentHashMap<Set<org.apache.parquet.column.Encoding>, Set<org.apache.parquet.column.Encoding>>();
+
+  public static FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
     List<BlockMetaData> blocks = parquetMetadata.getBlocks();
     List<RowGroup> rowGroups = new ArrayList<RowGroup>();
     int numRows = 0;
@@ -93,13 +109,14 @@ public class ParquetMetadataConverter {
     return fileMetaData;
   }
 
-  List<SchemaElement> toParquetSchema(MessageType schema) {
+  // Visible for testing
+  static List<SchemaElement> toParquetSchema(MessageType schema) {
     List<SchemaElement> result = new ArrayList<SchemaElement>();
     addToList(result, schema);
     return result;
   }
 
-  private void addToList(final List<SchemaElement> result, org.apache.parquet.schema.Type field) {
+  private static void addToList(final List<SchemaElement> result, org.apache.parquet.schema.Type field) {
     field.accept(new TypeVisitor() {
       @Override
       public void visit(PrimitiveType primitiveType) {
@@ -146,7 +163,7 @@ public class ParquetMetadataConverter {
     });
   }
 
-  private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
+  private static void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
     //rowGroup.total_byte_size = ;
     List<ColumnChunkMetaData> columns = block.getColumns();
     List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
@@ -175,7 +192,7 @@ public class ParquetMetadataConverter {
     rowGroups.add(rowGroup);
   }
 
-  private List<Encoding> toFormatEncodings(Set<org.apache.parquet.column.Encoding> encodings) {
+  private static List<Encoding> toFormatEncodings(Set<org.apache.parquet.column.Encoding> encodings) {
     List<Encoding> converted = new ArrayList<Encoding>(encodings.size());
     for (org.apache.parquet.column.Encoding encoding : encodings) {
       converted.add(getEncoding(encoding));
@@ -183,54 +200,35 @@ public class ParquetMetadataConverter {
     return converted;
   }
 
-  private static final class EncodingList {
-
-    private final Set<org.apache.parquet.column.Encoding> encodings;
-
-    public EncodingList(Set<org.apache.parquet.column.Encoding> encodings) {
-      this.encodings = encodings;
-    }
-
-    @Override
-    public boolean equals(Object obj) {
-      if (obj instanceof EncodingList) {
-        Set<org.apache.parquet.column.Encoding> other = ((EncodingList)obj).encodings;
-        return other.size() == encodings.size() && encodings.containsAll(other);
-      }
-      return false;
-    }
-
-    @Override
-    public int hashCode() {
-      int result = 1;
-      for (org.apache.parquet.column.Encoding element : encodings)
-        result = 31 * result + (element == null ? 0 : element.hashCode());
-      return result;
-    }
-  }
-
-  private Map<EncodingList, Set<org.apache.parquet.column.Encoding>> encodingLists = new HashMap<EncodingList, Set<org.apache.parquet.column.Encoding>>();
-
-  private Set<org.apache.parquet.column.Encoding> fromFormatEncodings(List<Encoding> encodings) {
+  // Visible for testing
+  static Set<org.apache.parquet.column.Encoding> fromFormatEncodings(List<Encoding> encodings) {
     Set<org.apache.parquet.column.Encoding> converted = new HashSet<org.apache.parquet.column.Encoding>();
+
     for (Encoding encoding : encodings) {
       converted.add(getEncoding(encoding));
     }
+
+    // make converted unmodifiable, drop reference to modifiable copy
     converted = Collections.unmodifiableSet(converted);
-    EncodingList key = new EncodingList(converted);
-    Set<org.apache.parquet.column.Encoding> cached = encodingLists.get(key);
+
+    // atomically update the cache
+    Set<org.apache.parquet.column.Encoding> cached = cachedEncodingSets.putIfAbsent(converted, converted);
+
     if (cached == null) {
+      // cached == null signifies that converted was *not* in the cache previously
+      // so we can return converted instead of throwing it away, it has now
+      // been cached
       cached = converted;
-      encodingLists.put(key, cached);
     }
+
     return cached;
   }
 
-  public org.apache.parquet.column.Encoding getEncoding(Encoding encoding) {
+  public static org.apache.parquet.column.Encoding getEncoding(Encoding encoding) {
     return org.apache.parquet.column.Encoding.valueOf(encoding.name());
   }
 
-  public Encoding getEncoding(org.apache.parquet.column.Encoding encoding) {
+  public static Encoding getEncoding(org.apache.parquet.column.Encoding encoding) {
     return Encoding.valueOf(encoding.name());
   }
 
@@ -259,7 +257,7 @@ public class ParquetMetadataConverter {
     return stats;
   }
 
-  public PrimitiveTypeName getPrimitive(Type type) {
+  public static PrimitiveTypeName getPrimitive(Type type) {
     switch (type) {
       case BYTE_ARRAY: // TODO: rename BINARY and remove this switch
         return PrimitiveTypeName.BINARY;
@@ -282,7 +280,8 @@ public class ParquetMetadataConverter {
     }
   }
 
-  Type getType(PrimitiveTypeName type) {
+  // Visible for testing
+  static Type getType(PrimitiveTypeName type) {
     switch (type) {
       case INT64:
         return Type.INT64;
@@ -305,7 +304,8 @@ public class ParquetMetadataConverter {
     }
   }
 
-  OriginalType getOriginalType(ConvertedType type) {
+  // Visible for testing
+  static OriginalType getOriginalType(ConvertedType type) {
     switch (type) {
       case UTF8:
         return OriginalType.UTF8;
@@ -352,7 +352,8 @@ public class ParquetMetadataConverter {
     }
   }
 
-  ConvertedType getConvertedType(OriginalType type) {
+  // Visible for testing
+  static ConvertedType getConvertedType(OriginalType type) {
     switch (type) {
       case UTF8:
         return ConvertedType.UTF8;
@@ -399,7 +400,7 @@ public class ParquetMetadataConverter {
      }
    }
 
-  private void addKeyValue(FileMetaData fileMetaData, String key, String value) {
+  private static void addKeyValue(FileMetaData fileMetaData, String key, String value) {
     KeyValue keyValue = new KeyValue(key);
     keyValue.value = value;
     fileMetaData.addToKey_value_metadata(keyValue);
@@ -415,15 +416,13 @@ public class ParquetMetadataConverter {
     private MetadataFilter() {}
    abstract <T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E;
   }
-  public static final MetadataFilter NO_FILTER = new NoFilter();
-  public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
   /**
    * [ startOffset, endOffset )
    * @param startOffset
    * @param endOffset
    * @return the filter
    */
-  public static final MetadataFilter range(long startOffset, long endOffset) {
+  public static MetadataFilter range(long startOffset, long endOffset) {
     return new RangeMetadataFilter(startOffset, endOffset);
   }
   private static final class NoFilter extends MetadataFilter {
@@ -452,6 +451,7 @@ public class ParquetMetadataConverter {
    * [ startOffset, endOffset )
    * @author Julien Le Dem
    */
+  // Visible for testing
   static final class RangeMetadataFilter extends MetadataFilter {
     final long startOffset;
     final long endOffset;
@@ -474,10 +474,11 @@ public class ParquetMetadataConverter {
   }
 
   @Deprecated
-  public ParquetMetadata readParquetMetadata(InputStream from) throws IOException {
+  public static ParquetMetadata readParquetMetadata(InputStream from) throws IOException {
     return readParquetMetadata(from, NO_FILTER);
   }
 
+  // Visible for testing
  static FileMetaData filterFileMetaData(FileMetaData metaData, RangeMetadataFilter filter) {
     List<RowGroup> rowGroups = metaData.getRow_groups();
     List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
@@ -496,9 +497,11 @@ public class ParquetMetadataConverter {
     return metaData;
   }
 
+  // Visible for testing
   static long getOffset(RowGroup rowGroup) {
     return getOffset(rowGroup.getColumns().get(0));
   }
+  // Visible for testing
   static long getOffset(ColumnChunk columnChunk) {
     ColumnMetaData md = columnChunk.getMeta_data();
     long offset = md.getData_page_offset();
@@ -508,7 +511,7 @@ public class ParquetMetadataConverter {
     return offset;
   }
 
-  public ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) throws IOException {
+  public static ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) throws IOException {
     FileMetaData fileMetaData = filter.accept(new MetadataFilterVisitor<FileMetaData, IOException>() {
       @Override
       public FileMetaData visit(NoFilter filter) throws IOException {
@@ -529,7 +532,7 @@ public class ParquetMetadataConverter {
     return parquetMetadata;
   }
 
-  public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
+  public static ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
     MessageType messageType = fromParquetSchema(parquetMetadata.getSchema());
     List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
     List<RowGroup> row_groups = parquetMetadata.getRow_groups();
@@ -579,12 +582,13 @@ public class ParquetMetadataConverter {
         blocks);
   }
 
-  private ColumnPath getPath(ColumnMetaData metaData) {
+  private static ColumnPath getPath(ColumnMetaData metaData) {
     String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
     return ColumnPath.get(path);
   }
 
-  MessageType fromParquetSchema(List<SchemaElement> schema) {
+  // Visible for testing
+  static MessageType fromParquetSchema(List<SchemaElement> schema) {
     Iterator<SchemaElement> iterator = schema.iterator();
     SchemaElement root = iterator.next();
     Types.MessageTypeBuilder builder = Types.buildMessage();
@@ -592,7 +596,7 @@ public class ParquetMetadataConverter {
     return builder.named(root.name);
   }
 
-  private void buildChildren(Types.GroupBuilder builder,
+  private static void buildChildren(Types.GroupBuilder builder,
                              Iterator<SchemaElement> schema,
                              int childrenCount) {
     for (int i = 0; i < childrenCount; i++) {
@@ -631,16 +635,18 @@ public class ParquetMetadataConverter {
     }
   }
 
-  FieldRepetitionType toParquetRepetition(Repetition repetition) {
+  // Visible for testing
+  static FieldRepetitionType toParquetRepetition(Repetition repetition) {
     return FieldRepetitionType.valueOf(repetition.name());
   }
 
-  Repetition fromParquetRepetition(FieldRepetitionType repetition) {
+  // Visible for testing
+  static Repetition fromParquetRepetition(FieldRepetitionType repetition) {
     return Repetition.valueOf(repetition.name());
   }
 
   @Deprecated
-  public void writeDataPageHeader(
+  public static void writeDataPageHeader(
       int uncompressedSize,
       int compressedSize,
       int valueCount,
@@ -657,7 +663,7 @@ public class ParquetMetadataConverter {
                                       valuesEncoding), to);
   }
 
-  public void writeDataPageHeader(
+  public static void writeDataPageHeader(
       int uncompressedSize,
       int compressedSize,
       int valueCount,
@@ -669,7 +675,7 @@ public class ParquetMetadataConverter {
     writePageHeader(newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, rlEncoding, dlEncoding, valuesEncoding), to);
   }
 
-  private PageHeader newDataPageHeader(
+  private static PageHeader newDataPageHeader(
       int uncompressedSize, int compressedSize,
       int valueCount,
       org.apache.parquet.column.statistics.Statistics statistics,
@@ -689,7 +695,7 @@ public class ParquetMetadataConverter {
     return pageHeader;
   }
 
-  public void writeDataPageV2Header(
+  public static void writeDataPageV2Header(
       int uncompressedSize, int compressedSize,
       int valueCount, int nullCount, int rowCount,
       org.apache.parquet.column.statistics.Statistics statistics,
@@ -705,7 +711,7 @@ public class ParquetMetadataConverter {
             rlByteLength, dlByteLength), to);
   }
 
-  private PageHeader newDataPageV2Header(
+  private static PageHeader newDataPageV2Header(
       int uncompressedSize, int compressedSize,
       int valueCount, int nullCount, int rowCount,
       org.apache.parquet.column.statistics.Statistics<?> statistics,
@@ -724,7 +730,7 @@ public class ParquetMetadataConverter {
     return pageHeader;
   }
 
-  public void writeDictionaryPageHeader(
+  public static void writeDictionaryPageHeader(
       int uncompressedSize, int compressedSize, int valueCount,
      org.apache.parquet.column.Encoding valuesEncoding, OutputStream to) throws IOException {
     PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/5c2ba72f/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
index 0a0b316..c90ba8a 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
@@ -46,8 +46,6 @@ import org.apache.parquet.schema.MessageType;
 class ColumnChunkPageWriteStore implements PageWriteStore {
   private static final Log LOG = Log.getLog(ColumnChunkPageWriteStore.class);
 
-  private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-
   private static final class ColumnChunkPageWriter implements PageWriter {
 
     private final ColumnDescriptor path;
@@ -94,7 +92,7 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
             + compressedSize);
       }
       tempOutputStream.reset();
-      parquetMetadataConverter.writeDataPageHeader(
+      ParquetMetadataConverter.writeDataPageHeader(
           (int)uncompressedSize,
           (int)compressedSize,
           valueCount,
@@ -133,7 +131,7 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
           compressedData.size() + repetitionLevels.size() + definitionLevels.size()
       );
       tempOutputStream.reset();
-      parquetMetadataConverter.writeDataPageV2Header(
+      ParquetMetadataConverter.writeDataPageV2Header(
           uncompressedSize, compressedSize,
           valueCount, nullCount, rowCount,
           statistics,

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/5c2ba72f/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 49e0833..7f97b19 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -88,9 +88,7 @@ public class ParquetFileReader implements Closeable {
 
   private static final Log LOG = Log.getLog(ParquetFileReader.class);
 
-  public static String PARQUET_READ_PARALLELISM = "parquet.metadata.read.parallelism";
-
-  private static ParquetMetadataConverter converter = new ParquetMetadataConverter();
+  public static final String PARQUET_READ_PARALLELISM = "parquet.metadata.read.parallelism";
 
   /**
    * for files provided, check if there's a summary file.
@@ -427,7 +425,7 @@ public class ParquetFileReader implements Closeable {
         throw new RuntimeException("corrupted file: the footer index is not within the file");
       }
       f.seek(footerIndex);
-      return converter.readParquetMetadata(f, filter);
+      return ParquetMetadataConverter.readParquetMetadata(f, filter);
     } finally {
       f.close();
     }
@@ -558,7 +556,7 @@ public class ParquetFileReader implements Closeable {
                     this.readAsBytesInput(compressedPageSize),
                     uncompressedPageSize,
                     dicHeader.getNum_values(),
-                    converter.getEncoding(dicHeader.getEncoding())
+                    ParquetMetadataConverter.getEncoding(dicHeader.getEncoding())
                     );
             break;
           case DATA_PAGE:
@@ -569,9 +567,9 @@ public class ParquetFileReader implements Closeable {
                     dataHeaderV1.getNum_values(),
                     uncompressedPageSize,
                     fromParquetStatistics(dataHeaderV1.getStatistics(), descriptor.col.getType()),
-                    converter.getEncoding(dataHeaderV1.getRepetition_level_encoding()),
-                    converter.getEncoding(dataHeaderV1.getDefinition_level_encoding()),
-                    converter.getEncoding(dataHeaderV1.getEncoding())
+                    ParquetMetadataConverter.getEncoding(dataHeaderV1.getRepetition_level_encoding()),
+                    ParquetMetadataConverter.getEncoding(dataHeaderV1.getDefinition_level_encoding()),
+                    ParquetMetadataConverter.getEncoding(dataHeaderV1.getEncoding())
                     ));
             valuesCountReadSoFar += dataHeaderV1.getNum_values();
             break;
@@ -585,7 +583,7 @@ public class ParquetFileReader implements Closeable {
                     dataHeaderV2.getNum_values(),
                     this.readAsBytesInput(dataHeaderV2.getRepetition_levels_byte_length()),
                     this.readAsBytesInput(dataHeaderV2.getDefinition_levels_byte_length()),
-                    converter.getEncoding(dataHeaderV2.getEncoding()),
+                    ParquetMetadataConverter.getEncoding(dataHeaderV2.getEncoding()),
                     this.readAsBytesInput(dataSize),
                     uncompressedPageSize,
                     fromParquetStatistics(dataHeaderV2.getStatistics(), descriptor.col.getType()),

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/5c2ba72f/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index 65423b5..b878179 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -94,8 +94,6 @@ public class ParquetFileWriter {
     OVERWRITE
   }
 
-  private static final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
-
   private final MessageType schema;
   private final FSDataOutputStream out;
   private final AlignmentStrategy alignment;
@@ -304,7 +302,7 @@ public class ParquetFileWriter {
     currentChunkDictionaryPageOffset = out.getPos();
     int uncompressedSize = dictionaryPage.getUncompressedSize();
     int compressedPageSize = (int)dictionaryPage.getBytes().size(); // TODO: fix casts
-    metadataConverter.writeDictionaryPageHeader(
+    ParquetMetadataConverter.writeDictionaryPageHeader(
         uncompressedSize,
         compressedPageSize,
         dictionaryPage.getDictionarySize(),
@@ -339,7 +337,7 @@ public class ParquetFileWriter {
     long beforeHeader = out.getPos();
     if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
     int compressedPageSize = (int)bytes.size();
-    metadataConverter.writeDataPageHeader(
+    ParquetMetadataConverter.writeDataPageHeader(
         uncompressedPageSize, compressedPageSize,
         valueCount,
         rlEncoding,
@@ -376,7 +374,7 @@ public class ParquetFileWriter {
     long beforeHeader = out.getPos();
     if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
     int compressedPageSize = (int)bytes.size();
-    metadataConverter.writeDataPageHeader(
+    ParquetMetadataConverter.writeDataPageHeader(
         uncompressedPageSize, compressedPageSize,
         valueCount,
         statistics,
@@ -469,7 +467,7 @@ public class ParquetFileWriter {
 
   private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
     long footerIndex = out.getPos();
-    org.apache.parquet.format.FileMetaData parquetMetadata = new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
+    org.apache.parquet.format.FileMetaData parquetMetadata = ParquetMetadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
     writeFileMetaData(parquetMetadata, out);
     if (DEBUG) LOG.debug(out.getPos() + ": footer length = " + (out.getPos() - footerIndex));
     BytesUtils.writeIntLittleEndian(out, (int)(out.getPos() - footerIndex));

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/5c2ba72f/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index 1a740fe..dd4aba9 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -21,6 +21,7 @@ package org.apache.parquet.format.converter;
 import static java.util.Collections.emptyList;
 import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.fail;
 import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED;
 import static org.apache.parquet.format.Type.INT32;
@@ -51,7 +52,6 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.junit.Assert;
 import org.junit.Test;
 
-import org.apache.parquet.column.Encoding;
 import org.apache.parquet.example.Paper;
 import org.apache.parquet.format.ColumnChunk;
 import org.apache.parquet.format.ColumnMetaData;
@@ -87,16 +87,14 @@ public class TestParquetMetadataConverter {
 
   @Test
   public void testSchemaConverter() {
-    ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-    List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema);
-    MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema);
+    List<SchemaElement> parquetSchema = ParquetMetadataConverter.toParquetSchema(Paper.schema);
+    MessageType schema = ParquetMetadataConverter.fromParquetSchema(parquetSchema);
     assertEquals(Paper.schema, schema);
   }
 
   @Test
   public void testSchemaConverterDecimal() {
-    ParquetMetadataConverter converter = new ParquetMetadataConverter();
-    List<SchemaElement> schemaElements = converter.toParquetSchema(
+    List<SchemaElement> schemaElements = ParquetMetadataConverter.toParquetSchema(
         Types.buildMessage()
             .required(PrimitiveTypeName.BINARY)
                 .as(OriginalType.DECIMAL).precision(9).scale(2)
@@ -125,30 +123,29 @@ public class TestParquetMetadataConverter {
 
   @Test
   public void testEnumEquivalence() {
-    ParquetMetadataConverter c = new ParquetMetadataConverter();
-    for (Encoding encoding : Encoding.values()) {
-      assertEquals(encoding, c.getEncoding(c.getEncoding(encoding)));
+    for (org.apache.parquet.column.Encoding encoding : org.apache.parquet.column.Encoding.values()) {
+      assertEquals(encoding, ParquetMetadataConverter.getEncoding(ParquetMetadataConverter.getEncoding(encoding)));
     }
     for (org.apache.parquet.format.Encoding encoding : org.apache.parquet.format.Encoding.values()) {
-      assertEquals(encoding, c.getEncoding(c.getEncoding(encoding)));
+      assertEquals(encoding, ParquetMetadataConverter.getEncoding(ParquetMetadataConverter.getEncoding(encoding)));
     }
     for (Repetition repetition : Repetition.values()) {
-      assertEquals(repetition, c.fromParquetRepetition(c.toParquetRepetition(repetition)));
+      assertEquals(repetition, ParquetMetadataConverter.fromParquetRepetition(ParquetMetadataConverter.toParquetRepetition(repetition)));
     }
     for (FieldRepetitionType repetition : FieldRepetitionType.values()) {
-      assertEquals(repetition, c.toParquetRepetition(c.fromParquetRepetition(repetition)));
+      assertEquals(repetition, ParquetMetadataConverter.toParquetRepetition(ParquetMetadataConverter.fromParquetRepetition(repetition)));
     }
     for (PrimitiveTypeName primitiveTypeName : PrimitiveTypeName.values()) {
-      assertEquals(primitiveTypeName, c.getPrimitive(c.getType(primitiveTypeName)));
+      assertEquals(primitiveTypeName, ParquetMetadataConverter.getPrimitive(ParquetMetadataConverter.getType(primitiveTypeName)));
     }
     for (Type type : Type.values()) {
-      assertEquals(type, c.getType(c.getPrimitive(type)));
+      assertEquals(type, ParquetMetadataConverter.getType(ParquetMetadataConverter.getPrimitive(type)));
     }
     for (OriginalType original : OriginalType.values()) {
-      assertEquals(original, c.getOriginalType(c.getConvertedType(original)));
+      assertEquals(original, ParquetMetadataConverter.getOriginalType(ParquetMetadataConverter.getConvertedType(original)));
     }
     for (ConvertedType converted : ConvertedType.values()) {
-      assertEquals(converted, c.getConvertedType(c.getOriginalType(converted)));
+      assertEquals(converted, ParquetMetadataConverter.getConvertedType(ParquetMetadataConverter.getOriginalType(converted)));
     }
   }
 
@@ -274,7 +271,7 @@ public class TestParquetMetadataConverter {
   }
 
   private ColumnChunkMetaData createColumnChunkMetaData() {
-    Set<Encoding> e = new HashSet<Encoding>();
+    Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
     PrimitiveTypeName t = PrimitiveTypeName.BINARY;
     ColumnPath p = ColumnPath.get("foo");
     CompressionCodecName c = CompressionCodecName.GZIP;
@@ -283,4 +280,40 @@ public class TestParquetMetadataConverter {
             0, 0, 0, 0, 0);
     return md;
   }
+  
+  @Test
+  public void testEncodingsCache() {
+    List<org.apache.parquet.format.Encoding> formatEncodingsCopy1 =
+        Arrays.asList(org.apache.parquet.format.Encoding.BIT_PACKED,
+                      org.apache.parquet.format.Encoding.RLE_DICTIONARY,
+                      org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY);
+
+    List<org.apache.parquet.format.Encoding> formatEncodingsCopy2 =
+        Arrays.asList(org.apache.parquet.format.Encoding.BIT_PACKED,
+            org.apache.parquet.format.Encoding.RLE_DICTIONARY,
+            org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY);
+
+    Set<org.apache.parquet.column.Encoding> expected = new HashSet<org.apache.parquet.column.Encoding>();
+    expected.add(org.apache.parquet.column.Encoding.BIT_PACKED);
+    expected.add(org.apache.parquet.column.Encoding.RLE_DICTIONARY);
+    expected.add(org.apache.parquet.column.Encoding.DELTA_LENGTH_BYTE_ARRAY);
+
+    Set<org.apache.parquet.column.Encoding> res1 = ParquetMetadataConverter.fromFormatEncodings(formatEncodingsCopy1);
+    Set<org.apache.parquet.column.Encoding> res2 = ParquetMetadataConverter.fromFormatEncodings(formatEncodingsCopy1);
+    Set<org.apache.parquet.column.Encoding> res3 = ParquetMetadataConverter.fromFormatEncodings(formatEncodingsCopy2);
+
+    // make sure they are all semantically equal
+    assertEquals(expected, res1);
+    assertEquals(expected, res2);
+    assertEquals(expected, res3);
+
+    // make sure res1, res2, and res3 are actually the same cached object
+    assertSame(res1, res2);
+    assertSame(res1, res3);
+
+    // make sure they are all unmodifiable (UnmodifiableSet is not public, so we have to compare on class name)
+    assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
+    assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
+    assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
+  }  
 }

