Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 55715200C0F for ; Thu, 19 Jan 2017 02:27:19 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 54018160B43; Thu, 19 Jan 2017 01:27:19 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id B665E160B5E for ; Thu, 19 Jan 2017 02:27:17 +0100 (CET) Received: (qmail 54652 invoked by uid 500); 19 Jan 2017 01:27:16 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 52907 invoked by uid 99); 19 Jan 2017 01:27:13 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jan 2017 01:27:13 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 2AAD2F4034; Thu, 19 Jan 2017 01:27:13 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: blue@apache.org To: commits@parquet.apache.org Date: Thu, 19 Jan 2017 01:27:40 -0000 Message-Id: <973836cfefaa498a96b0ee00f08cd7d1@git.apache.org> In-Reply-To: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> References: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [29/50] [abbrv] parquet-mr git commit: PARQUET-674: Add InputFile abstraction for openable files. archived-at: Thu, 19 Jan 2017 01:27:19 -0000 PARQUET-674: Add InputFile abstraction for openable files. Author: Ryan Blue Closes #368 from rdblue/PARQUET-674-add-data-source and squashes the following commits: 8c689e9 [Ryan Blue] PARQUET-674: Implement review comments. 4a7c327 [Ryan Blue] PARQUET-674: Add DataSource abstraction for openable files. Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/2990dea6 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/2990dea6 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/2990dea6 Branch: refs/heads/parquet-1.8.x Commit: 2990dea6eda8af82165cbdc85cffca81787ca8c3 Parents: 4beb060 Author: Ryan Blue Authored: Mon Oct 3 15:04:12 2016 -0700 Committer: Ryan Blue Committed: Mon Jan 9 16:54:54 2017 -0800 ---------------------------------------------------------------------- .../java/org/apache/parquet/io/InputFile.java | 43 +++++++++++++ .../parquet/hadoop/ParquetFileReader.java | 32 ++++++---- .../parquet/hadoop/util/HadoopInputFile.java | 66 ++++++++++++++++++++ 3 files changed, 130 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2990dea6/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java ---------------------------------------------------------------------- diff --git a/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java b/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java new file mode 100644 index 0000000..e2c7cc0 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.io; + +import java.io.IOException; + +/** + * {@code InputFile} is an interface with the methods needed by Parquet to read + * data files using {@link SeekableInputStream} instances. + */ +public interface InputFile { + + /** + * Returns the total length of the file, in bytes. + * @throws IOException if the length cannot be determined + */ + long getLength() throws IOException; + + /** + * Opens a new {@link SeekableInputStream} for the underlying + * data file. + * @throws IOException if the stream cannot be opened. + */ + SeekableInputStream newStream() throws IOException; + +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2990dea6/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 8e315a5..24d17a0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -85,11 +85,13 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.util.HadoopStreams; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.InputFile; /** * Internal implementation of the Parquet file reader as a block container @@ -394,8 +396,7 @@ public class ParquetFileReader implements Closeable { * @throws IOException if an error occurs while reading the file */ public static ParquetMetadata readFooter(Configuration configuration, Path file, MetadataFilter filter) throws IOException { - FileSystem fileSystem = file.getFileSystem(configuration); - return readFooter(configuration, fileSystem.getFileStatus(file), filter); + return readFooter(HadoopInputFile.fromPath(file, configuration), filter); } /** @@ -415,12 +416,21 @@ public class ParquetFileReader implements Closeable { * @throws IOException if an error occurs while reading the file */ public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException { - FileSystem fileSystem = file.getPath().getFileSystem(configuration); - SeekableInputStream in = HadoopStreams.wrap(fileSystem.open(file.getPath())); - try { - return readFooter(file.getLen(), file.getPath().toString(), in, filter); - } finally { - in.close(); + return readFooter(HadoopInputFile.fromStatus(file, configuration), filter); + } + + /** + * Reads the meta data block in the footer of the file using provided input stream + * @param file a {@link InputFile} to read + * @param filter the filter to apply to row groups + * @return the metadata blocks in the footer + * @throws IOException if an error occurs while reading the file + */ + public static final ParquetMetadata readFooter( + InputFile file, MetadataFilter filter) throws IOException { + try (SeekableInputStream in = file.newStream()) { + return readFooter(converter, file.getLength(), file.toString(), + in, filter); } } @@ -433,7 +443,7 @@ public class ParquetFileReader implements Closeable { * @return the metadata blocks in the footer * @throws IOException if an error occurs while reading the file */ - public static final ParquetMetadata readFooter(long fileLen, String filePath, SeekableInputStream f, MetadataFilter filter) throws IOException { + private static final ParquetMetadata readFooter(ParquetMetadataConverter converter, long fileLen, String filePath, SeekableInputStream f, MetadataFilter filter) throws IOException { if (Log.DEBUG) { LOG.debug("File length " + fileLen); } @@ -543,7 +553,7 @@ public class ParquetFileReader implements Closeable { FileSystem fs = file.getFileSystem(conf); this.fileStatus = fs.getFileStatus(file); this.f = HadoopStreams.wrap(fs.open(file)); - this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, filter); + this.footer = readFooter(converter, fileStatus.getLen(), fileStatus.getPath().toString(), f, filter); this.fileMetaData = footer.getFileMetaData(); this.blocks = footer.getBlocks(); for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { @@ -580,7 +590,7 @@ public class ParquetFileReader implements Closeable { if (footer == null) { try { // don't read the row groups because this.blocks is always set - this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS); + this.footer = readFooter(converter, fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS); } catch (IOException e) { throw new ParquetDecodingException("Unable to read file footer", e); } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2990dea6/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java new file mode 100644 index 0000000..d5868d3 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.hadoop.util; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.io.SeekableInputStream; +import org.apache.parquet.io.InputFile; +import java.io.IOException; + +public class HadoopInputFile implements InputFile { + + private final FileSystem fs; + private final FileStatus stat; + + public static HadoopInputFile fromPath(Path path, Configuration conf) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + return new HadoopInputFile(fs, fs.getFileStatus(path)); + } + + public static HadoopInputFile fromStatus(FileStatus stat, Configuration conf) + throws IOException { + FileSystem fs = stat.getPath().getFileSystem(conf); + return new HadoopInputFile(fs, stat); + } + + private HadoopInputFile(FileSystem fs, FileStatus stat) { + this.fs = fs; + this.stat = stat; + } + + @Override + public long getLength() { + return stat.getLen(); + } + + @Override + public SeekableInputStream newStream() throws IOException { + return HadoopStreams.wrap(fs.open(stat.getPath())); + } + + @Override + public String toString() { + return stat.getPath().toString(); + } +}