parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject incubator-parquet-mr git commit: PARQUET-142: add path filter in ParquetReader
Date Fri, 30 Jan 2015 01:31:07 GMT
Repository: incubator-parquet-mr
Updated Branches:
  refs/heads/master e505e1fea -> b4380f200


PARQUET-142: add path filter in ParquetReader

Currently the parquet-tools command fails when the input is a directory that contains a _SUCCESS file written by MapReduce.
Filtering such files out, as ParquetFileReader already does, fixes the problem.

```
parquet-cat /tmp/parquet_write_test
Could not read footer: java.lang.RuntimeException: file:/tmp/parquet_write_test/_SUCCESS is not a Parquet file (too small)

$ tree /tmp/parquet_write_test
/tmp/parquet_write_test
├── part-m-00000.parquet
└── _SUCCESS
```

Author: Neville Li <neville@spotify.com>

Closes #89 from nevillelyh/gh/path-filter and squashes the following commits:

7377a20 [Neville Li] PARQUET-142: add path filter in ParquetReader


Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/b4380f20
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/b4380f20
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/b4380f20

Branch: refs/heads/master
Commit: b4380f20059dc9e4ccfe2b709587e8069ac0fa34
Parents: e505e1f
Author: Neville Li <neville@spotify.com>
Authored: Thu Jan 29 17:31:04 2015 -0800
Committer: Ryan Blue <blue@apache.org>
Committed: Thu Jan 29 17:31:04 2015 -0800

----------------------------------------------------------------------
 .../src/main/java/parquet/hadoop/ParquetReader.java          | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/b4380f20/parquet-hadoop/src/main/java/parquet/hadoop/ParquetReader.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetReader.java b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetReader.java
index ec839e2..4d80f0f 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetReader.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetReader.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
+import org.apache.hadoop.fs.PathFilter;
 import parquet.filter.UnboundRecordFilter;
 import parquet.filter2.compat.FilterCompat;
 import parquet.filter2.compat.FilterCompat.Filter;
@@ -113,7 +114,12 @@ public class ParquetReader<T> implements Closeable {
     this.conf = conf;
 
     FileSystem fs = file.getFileSystem(conf);
-    List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
+    List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, new PathFilter() {
+      @Override
+      public boolean accept(Path p) {
+        return !p.getName().startsWith("_") && !p.getName().startsWith(".");
+      }
+    }));
     List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
     this.footersIterator = footers.iterator();
     globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);


Mime
View raw message