Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 03DB8200C24 for ; Thu, 19 Jan 2017 02:27:19 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 02636160B43; Thu, 19 Jan 2017 01:27:19 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id B94B3160B60 for ; Thu, 19 Jan 2017 02:27:17 +0100 (CET) Received: (qmail 54697 invoked by uid 500); 19 Jan 2017 01:27:16 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 52974 invoked by uid 99); 19 Jan 2017 01:27:13 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jan 2017 01:27:13 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 68B44F403A; Thu, 19 Jan 2017 01:27:13 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: blue@apache.org To: commits@parquet.apache.org Date: Thu, 19 Jan 2017 01:27:54 -0000 Message-Id: In-Reply-To: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> References: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [43/50] [abbrv] parquet-mr git commit: PARQUET-669: allow reading footers from provided file listing and streams archived-at: Thu, 19 Jan 2017 01:27:19 -0000 PARQUET-669: allow reading footers from provided file listing and streams The use case is that I want to reuse existing listing of files and avoid doing it again when opening streams. 
This is for the case where filesystem.open is expensive but you have other means of obtaining an input stream for a file. Author: Robert Kruszewski Closes #357 from robert3005/robertk/allow-reading-footers-from-streams and squashes the following commits: 4d8a54c [Robert Kruszewski] allow reading footers from provided file listing and streams Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/f8489499 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/f8489499 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/f8489499 Branch: refs/heads/parquet-1.8.x Commit: f8489499aa6d460a7548b14d516638b0bd7b862b Parents: aced0eb Author: Robert Kruszewski Authored: Wed Aug 3 14:22:27 2016 -0700 Committer: Ryan Blue Committed: Mon Jan 9 16:54:54 2017 -0800 ---------------------------------------------------------------------- .../parquet/hadoop/ParquetFileReader.java | 28 +++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/f8489499/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 2d1c62b..d018835 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -417,22 +417,30 @@ public class ParquetFileReader implements Closeable { FileSystem fileSystem = file.getPath().getFileSystem(configuration); FSDataInputStream in = fileSystem.open(file.getPath()); try { - return readFooter(file, in, filter); + return readFooter(file.getLen(), file.getPath().toString(), in, 
filter); } finally { in.close(); } } - private static final ParquetMetadata readFooter(FileStatus file, FSDataInputStream f, MetadataFilter filter) throws IOException { - long l = file.getLen(); + /** + * Reads the meta data block in the footer of the file using provided input stream + * @param fileLen length of the file + * @param filePath file location + * @param f input stream for the file + * @param filter the filter to apply to row groups + * @return the metadata blocks in the footer + * @throws IOException if an error occurs while reading the file + */ + public static final ParquetMetadata readFooter(long fileLen, String filePath, FSDataInputStream f, MetadataFilter filter) throws IOException { if (Log.DEBUG) { - LOG.debug("File length " + l); + LOG.debug("File length " + fileLen); } int FOOTER_LENGTH_SIZE = 4; - if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC - throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)"); + if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC + throw new RuntimeException(filePath + " is not a Parquet file (too small)"); } - long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length; + long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; if (Log.DEBUG) { LOG.debug("reading footer index at " + footerLengthIndex); } @@ -442,7 +450,7 @@ public class ParquetFileReader implements Closeable { byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { - throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); + throw new RuntimeException(filePath + " is not a Parquet file. 
expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; if (Log.DEBUG) { @@ -534,7 +542,7 @@ public class ParquetFileReader implements Closeable { FileSystem fs = file.getFileSystem(conf); this.fileStatus = fs.getFileStatus(file); this.f = fs.open(file); - this.footer = readFooter(fileStatus, f, filter); + this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, filter); this.fileMetaData = footer.getFileMetaData(); this.blocks = footer.getBlocks(); for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { @@ -571,7 +579,7 @@ public class ParquetFileReader implements Closeable { if (footer == null) { try { // don't read the row groups because this.blocks is always set - this.footer = readFooter(fileStatus, f, SKIP_ROW_GROUPS); + this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS); } catch (IOException e) { throw new ParquetDecodingException("Unable to read file footer", e); }