parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject [43/50] [abbrv] parquet-mr git commit: PARQUET-669: allow reading footers from provided file listing and streams
Date Thu, 19 Jan 2017 01:27:54 GMT
PARQUET-669: allow reading footers from provided file listing and streams

The use case is that I want to reuse an existing listing of files and avoid listing them again
when opening streams. This applies when filesystem.open is expensive but you have another means
of obtaining an input stream for a file.

Author: Robert Kruszewski <robertk@palantir.com>

Closes #357 from robert3005/robertk/allow-reading-footers-from-streams and squashes the following
commits:

4d8a54c [Robert Kruszewski] allow reading footers from provided file listing and streams


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/f8489499
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/f8489499
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/f8489499

Branch: refs/heads/parquet-1.8.x
Commit: f8489499aa6d460a7548b14d516638b0bd7b862b
Parents: aced0eb
Author: Robert Kruszewski <robertk@palantir.com>
Authored: Wed Aug 3 14:22:27 2016 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Mon Jan 9 16:54:54 2017 -0800

----------------------------------------------------------------------
 .../parquet/hadoop/ParquetFileReader.java       | 28 +++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/f8489499/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 2d1c62b..d018835 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -417,22 +417,30 @@ public class ParquetFileReader implements Closeable {
     FileSystem fileSystem = file.getPath().getFileSystem(configuration);
     FSDataInputStream in = fileSystem.open(file.getPath());
     try {
-      return readFooter(file, in, filter);
+      return readFooter(file.getLen(), file.getPath().toString(), in, filter);
     } finally {
       in.close();
     }
   }
 
-  private static final ParquetMetadata readFooter(FileStatus file, FSDataInputStream f, MetadataFilter filter) throws IOException {
-    long l = file.getLen();
+  /**
+   * Reads the meta data block in the footer of the file using provided input stream
+   * @param fileLen length of the file
+   * @param filePath file location
+   * @param f input stream for the file
+   * @param filter the filter to apply to row groups
+   * @return the metadata blocks in the footer
+   * @throws IOException if an error occurs while reading the file
+   */
+  public static final ParquetMetadata readFooter(long fileLen, String filePath, FSDataInputStream f, MetadataFilter filter) throws IOException {
     if (Log.DEBUG) {
-      LOG.debug("File length " + l);
+      LOG.debug("File length " + fileLen);
     }
     int FOOTER_LENGTH_SIZE = 4;
-    if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
-      throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
+    if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
+      throw new RuntimeException(filePath + " is not a Parquet file (too small)");
     }
-    long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
+    long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
     if (Log.DEBUG) {
       LOG.debug("reading footer index at " + footerLengthIndex);
     }
@@ -442,7 +450,7 @@ public class ParquetFileReader implements Closeable {
     byte[] magic = new byte[MAGIC.length];
     f.readFully(magic);
     if (!Arrays.equals(MAGIC, magic)) {
-      throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
+      throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
     }
     long footerIndex = footerLengthIndex - footerLength;
     if (Log.DEBUG) {
@@ -534,7 +542,7 @@ public class ParquetFileReader implements Closeable {
     FileSystem fs = file.getFileSystem(conf);
     this.fileStatus = fs.getFileStatus(file);
     this.f = fs.open(file);
-    this.footer = readFooter(fileStatus, f, filter);
+    this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, filter);
     this.fileMetaData = footer.getFileMetaData();
     this.blocks = footer.getBlocks();
     for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
@@ -571,7 +579,7 @@ public class ParquetFileReader implements Closeable {
     if (footer == null) {
       try {
         // don't read the row groups because this.blocks is always set
-        this.footer = readFooter(fileStatus, f, SKIP_ROW_GROUPS);
+        this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS);
       } catch (IOException e) {
         throw new ParquetDecodingException("Unable to read file footer", e);
       }


Mime
View raw message