From: blue@apache.org
To: commits@parquet.apache.org
Reply-To: dev@parquet.apache.org
Subject: parquet-mr git commit: PARQUET-569: Separate metadata filtering for ranges and offsets.
Date: Sat, 23 Apr 2016 00:42:39 +0000 (UTC)

Repository: parquet-mr
Updated Branches:
  refs/heads/master 3dd2210e7 -> 2f22533ef


PARQUET-569: Separate metadata filtering for ranges and offsets.

Range filtering should use the row group midpoint, and offset filtering
should use the start offset.

Author: Ryan Blue <blue@apache.org>

Closes #337 from rdblue/PARQUET-569-fix-metadata-filter and squashes the
following commits:

6171af4 [Ryan Blue] PARQUET-569: Add tests for new offset metadata filter.
3fe2d5e [Ryan Blue] PARQUET-569: Separate metadata filtering for ranges and offsets.
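The fix separates two selection rules: a byte-range (split) filter keeps a row
group when the group's midpoint falls in [start, end), while an offset filter
keeps a row group only when its exact start offset was requested. Below is a
minimal, self-contained sketch of those two rules, not the patch itself;
SimpleRowGroup and the method names are hypothetical stand-ins for the Thrift
RowGroup metadata that parquet-mr actually filters.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class RowGroupFilterSketch {
  /** Hypothetical stand-in for a row group: where it starts and how many bytes it spans. */
  static class SimpleRowGroup {
    final long startOffset;
    final long totalByteSize;
    SimpleRowGroup(long startOffset, long totalByteSize) {
      this.startOffset = startOffset;
      this.totalByteSize = totalByteSize;
    }
  }

  /** Range (split) filtering: keep a row group when its midpoint falls in [start, end). */
  static List<SimpleRowGroup> filterByMidpoint(List<SimpleRowGroup> groups, long start, long end) {
    List<SimpleRowGroup> kept = new ArrayList<SimpleRowGroup>();
    for (SimpleRowGroup group : groups) {
      long midpoint = group.startOffset + group.totalByteSize / 2;
      if (midpoint >= start && midpoint < end) {
        kept.add(group);
      }
    }
    return kept;
  }

  /** Offset filtering: keep a row group only when its exact start offset was requested. */
  static List<SimpleRowGroup> filterByStart(List<SimpleRowGroup> groups, Set<Long> requestedStarts) {
    List<SimpleRowGroup> kept = new ArrayList<SimpleRowGroup>();
    for (SimpleRowGroup group : groups) {
      if (requestedStarts.contains(group.startOffset)) {
        kept.add(group);
      }
    }
    return kept;
  }

  public static void main(String[] args) {
    List<SimpleRowGroup> groups = Arrays.asList(
        new SimpleRowGroup(0, 50), new SimpleRowGroup(50, 50), new SimpleRowGroup(100, 50));
    // The range [40, 90) contains only the second group's midpoint (75), so one group is kept.
    System.out.println(filterByMidpoint(groups, 40, 90).size());                              // 1
    // Offset filtering matches exact starts only; 10 is not the start of any row group.
    System.out.println(filterByStart(groups, new HashSet<Long>(Arrays.asList(10L))).size());  // 0
  }
}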
Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/2f22533e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/2f22533e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/2f22533e

Branch: refs/heads/master
Commit: 2f22533ef41533e2b839a6b41b262dca59e6dbf9
Parents: 3dd2210
Author: Ryan Blue <blue@apache.org>
Authored: Fri Apr 22 17:42:35 2016 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Fri Apr 22 17:42:35 2016 -0700

----------------------------------------------------------------------
 .../converter/ParquetMetadataConverter.java     | 36 ++++++++++++++------
 .../converter/TestParquetMetadataConverter.java | 31 +++++++++++++++--
 2 files changed, 54 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 6feb4a2..75b07fd 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -478,6 +478,7 @@ public class ParquetMetadataConverter {
   private static interface MetadataFilterVisitor<T, E extends Throwable> {
     T visit(NoFilter filter) throws E;
     T visit(SkipMetadataFilter filter) throws E;
+    T visit(RangeMetadataFilter filter) throws E;
     T visit(OffsetMetadataFilter filter) throws E;
   }
@@ -501,7 +502,7 @@ public class ParquetMetadataConverter {
     for (long offset : offsets) {
       set.add(offset);
     }
-    return new OffsetListMetadataFilter(set);
+    return new OffsetMetadataFilter(set);
   }
 
   private static final class NoFilter extends MetadataFilter {
@@ -527,16 +528,12 @@ public class ParquetMetadataConverter {
     }
   }
 
-  interface OffsetMetadataFilter {
-    boolean contains(long offset);
-  }
-
   /**
    * [ startOffset, endOffset )
   * @author Julien Le Dem
   */
   // Visible for testing
-  static final class RangeMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+  static final class RangeMetadataFilter extends MetadataFilter {
     final long startOffset;
     final long endOffset;
@@ -551,7 +548,6 @@ public class ParquetMetadataConverter {
       return visitor.visit(this);
     }
 
-    @Override
     public boolean contains(long offset) {
       return offset >= this.startOffset && offset < this.endOffset;
     }
@@ -562,10 +558,10 @@ public class ParquetMetadataConverter {
     }
   }
 
-  static final class OffsetListMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+  static final class OffsetMetadataFilter extends MetadataFilter {
     private final Set<Long> offsets;
 
-    public OffsetListMetadataFilter(Set<Long> offsets) {
+    public OffsetMetadataFilter(Set<Long> offsets) {
       this.offsets = offsets;
     }
@@ -585,7 +581,7 @@ public class ParquetMetadataConverter {
   }
 
   // Visible for testing
-  static FileMetaData filterFileMetaData(FileMetaData metaData, OffsetMetadataFilter filter) {
+  static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
     List<RowGroup> rowGroups = metaData.getRow_groups();
     List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
     for (RowGroup rowGroup : rowGroups) {
@@ -604,6 +600,19 @@ public class ParquetMetadataConverter {
   }
 
   // Visible for testing
+  static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
+    List<RowGroup> rowGroups = metaData.getRow_groups();
+    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
+    for (RowGroup rowGroup : rowGroups) {
+      long startIndex = getOffset(rowGroup.getColumns().get(0));
+      if (filter.contains(startIndex)) {
+        newRowGroups.add(rowGroup);
+      }
+    }
+    metaData.setRow_groups(newRowGroups);
+    return metaData;
+  }
+
   static long getOffset(RowGroup rowGroup) {
     return getOffset(rowGroup.getColumns().get(0));
   }
@@ -631,7 +640,12 @@ public class ParquetMetadataConverter {
       @Override
       public FileMetaData visit(OffsetMetadataFilter filter) throws IOException {
-        return filterFileMetaData(readFileMetaData(from), filter);
+        return filterFileMetaDataByStart(readFileMetaData(from), filter);
+      }
+
+      @Override
+      public FileMetaData visit(RangeMetadataFilter filter) throws IOException {
+        return filterFileMetaDataByMidpoint(readFileMetaData(from), filter);
       }
     });
     if (Log.DEBUG) LOG.debug(fileMetaData);
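The hunks above also give RangeMetadataFilter and OffsetMetadataFilter their own
visit() overloads on MetadataFilterVisitor, which is what lets readParquetMetadata
route range filters to the midpoint check and offset filters to the start-offset
check. A toy sketch of that double-dispatch shape follows; it is an assumed
simplification (the real visitor is also generic over a checked exception type,
and these class names are stand-ins, not parquet-mr code).

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class FilterVisitorSketch {
  /** One visit() overload per filter subtype, mirroring MetadataFilterVisitor. */
  interface Visitor<T> {
    T visit(RangeFilter filter);
    T visit(OffsetFilter filter);
  }

  static abstract class Filter {
    abstract <T> T accept(Visitor<T> visitor);
  }

  /** Keeps row groups whose midpoints fall in [start, end). */
  static final class RangeFilter extends Filter {
    final long start, end;
    RangeFilter(long start, long end) { this.start = start; this.end = end; }
    @Override <T> T accept(Visitor<T> visitor) { return visitor.visit(this); }
  }

  /** Keeps row groups that begin at one of the requested offsets. */
  static final class OffsetFilter extends Filter {
    final Set<Long> offsets;
    OffsetFilter(Set<Long> offsets) { this.offsets = offsets; }
    @Override <T> T accept(Visitor<T> visitor) { return visitor.visit(this); }
  }

  public static void main(String[] args) {
    Filter[] filters = {
        new RangeFilter(0, 100),
        new OffsetFilter(new HashSet<Long>(Arrays.asList(0L, 50L)))
    };
    for (Filter filter : filters) {
      // Each subtype reaches its own visit() overload, so the midpoint rule and the
      // start-offset rule live on separate code paths, as in the patch.
      String rule = filter.accept(new Visitor<String>() {
        @Override public String visit(RangeFilter f) {
          return "keep row groups with midpoint in [" + f.start + ", " + f.end + ")";
        }
        @Override public String visit(OffsetFilter f) {
          return "keep row groups starting at " + f.offsets;
        }
      });
      System.out.println(rule);
    }
  }
}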
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index eb109c0..b9cfde7 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -19,6 +19,7 @@ package org.apache.parquet.format.converter;
 
 import static java.util.Collections.emptyList;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart;
 import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertSame;
@@ -27,7 +28,7 @@
 import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED;
 import static org.apache.parquet.format.Type.INT32;
 import static org.apache.parquet.format.Util.readPageHeader;
 import static org.apache.parquet.format.Util.writePageHeader;
-import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaData;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint;
 import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset;
 
@@ -43,6 +44,7 @@
 import java.util.Random;
 import java.util.Set;
 import java.util.TreeSet;
 
+import com.google.common.collect.Sets;
 import org.apache.parquet.column.statistics.BinaryStatistics;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
@@ -170,7 +172,20 @@ public class TestParquetMetadataConverter {
   }
 
   private FileMetaData filter(FileMetaData md, long start, long end) {
-    return filterFileMetaData(new FileMetaData(md), new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+    return filterFileMetaDataByMidpoint(new FileMetaData(md),
+        new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+  }
+
+  private FileMetaData find(FileMetaData md, Long... blockStart) {
+    return filterFileMetaDataByStart(new FileMetaData(md),
+        new ParquetMetadataConverter.OffsetMetadataFilter(
+            Sets.newHashSet((Long[]) blockStart)));
+  }
+
+  private FileMetaData find(FileMetaData md, long blockStart) {
+    return filterFileMetaDataByStart(new FileMetaData(md),
+        new ParquetMetadataConverter.OffsetMetadataFilter(
+            Sets.newHashSet(blockStart)));
   }
 
   private void verifyMD(FileMetaData md, long... offsets) {
@@ -243,6 +258,18 @@ public class TestParquetMetadataConverter {
   }
 
   @Test
+  public void testFindRowGroups() {
+    verifyMD(find(metadata(50, 50, 50), 0), 0);
+    verifyMD(find(metadata(50, 50, 50), 50), 50);
+    verifyMD(find(metadata(50, 50, 50), 100), 100);
+    verifyMD(find(metadata(50, 50, 50), 0L, 50L), 0, 50);
+    verifyMD(find(metadata(50, 50, 50), 0L, 50L, 100L), 0, 50, 100);
+    verifyMD(find(metadata(50, 50, 50), 50L, 100L), 50, 100);
+    // doesn't find an offset that isn't the start of a row group.
+    verifyMD(find(metadata(50, 50, 50), 10));
+  }
+
+  @Test
   public void randomTestFilterMetaData() {
     // randomized property based testing
     // if it fails add the case above
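For callers, both filter kinds are still obtained through the converter's static
factory methods rather than by constructing the package-private filter classes
directly. A hedged usage sketch follows: offsets(...) appears to be the factory
whose body is modified in the diff above, and range(...) is assumed to be the
pre-existing factory that produces a RangeMetadataFilter; treat the exact
signatures here as assumptions rather than documented API.

import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter.MetadataFilter;

public class MetadataFilterUsageSketch {
  public static void main(String[] args) {
    // Split-style filtering: keep row groups whose midpoints fall inside [0, 75).
    MetadataFilter bySplit = ParquetMetadataConverter.range(0, 75);

    // Offset-style filtering: keep exactly the row groups that start at offsets 0 and 50.
    MetadataFilter byStarts = ParquetMetadataConverter.offsets(0L, 50L);

    // Either filter can then be handed to the footer-reading path, which after this
    // commit routes range filters to filterFileMetaDataByMidpoint and offset filters
    // to filterFileMetaDataByStart via the new visitor cases.
    System.out.println(bySplit);
    System.out.println(byStarts);
  }
}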