parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-1087: Add ScanContents function to arrow::FileReader that catches Parquet exceptions
Date Tue, 05 Sep 2017 19:44:46 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master a0c349b6f -> bc46b1436


PARQUET-1087: Add ScanContents function to arrow::FileReader that catches Parquet exceptions

Also fixes a bug in ScanFileContents re: number of rows returned

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #387 from wesm/PARQUET-1087 and squashes the following commits:

e555e78 [Wes McKinney] Add ScanContents function to arrow::FileReader


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/bc46b143
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/bc46b143
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/bc46b143

Branch: refs/heads/master
Commit: bc46b1436508447f2335c48b343563e2e605f8d3
Parents: a0c349b
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Tue Sep 5 15:44:40 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Tue Sep 5 15:44:40 2017 -0400

----------------------------------------------------------------------
 src/parquet/arrow/arrow-reader-writer-test.cc | 23 ++++++++++++++++++++++
 src/parquet/arrow/reader.cc                   | 12 +++++++++++
 src/parquet/arrow/reader.h                    |  4 ++++
 src/parquet/file/reader.cc                    |  3 +--
 4 files changed, 40 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bc46b143/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 2adda67..986adfc 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1243,6 +1243,29 @@ TEST(TestArrowReadWrite, ReadSingleRowGroup) {
   ASSERT_TRUE(table->Equals(*concatenated));
 }
 
+TEST(TestArrowReadWrite, ScanContents) {
+  const int num_columns = 20;
+  const int num_rows = 1000;
+
+  std::shared_ptr<Table> table;
+  MakeDoubleTable(num_columns, num_rows, 1, &table);
+
+  std::shared_ptr<Buffer> buffer;
+  WriteTableToBuffer(table, 1, num_rows / 2, default_arrow_writer_properties(), &buffer);
+
+  std::unique_ptr<FileReader> reader;
+  ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
+                              ::arrow::default_memory_pool(),
+                              ::parquet::default_reader_properties(), nullptr, &reader));
+
+  int64_t num_rows_returned = 0;
+  ASSERT_OK_NO_THROW(reader->ScanContents({}, 256, &num_rows_returned));
+  ASSERT_EQ(num_rows, num_rows_returned);
+
+  ASSERT_OK_NO_THROW(reader->ScanContents({0, 1, 2}, 256, &num_rows_returned));
+  ASSERT_EQ(num_rows, num_rows_returned);
+}
+
 TEST(TestArrowReadWrite, ReadColumnSubset) {
   const int num_columns = 20;
   const int num_rows = 1000;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bc46b143/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index ead2780..9908280 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -214,6 +214,8 @@ class FileReader::Impl {
 
   void set_num_threads(int num_threads) { num_threads_ = num_threads; }
 
+  ParquetFileReader* reader() { return reader_.get(); }
+
  private:
   MemoryPool* pool_;
   std::unique_ptr<ParquetFileReader> reader_;
@@ -626,6 +628,16 @@ int FileReader::num_row_groups() const { return impl_->num_row_groups(); }
 
 void FileReader::set_num_threads(int num_threads) { impl_->set_num_threads(num_threads); }
 
+Status FileReader::ScanContents(std::vector<int> columns, const int32_t column_batch_size,
+                                int64_t* num_rows) {
+  try {
+    *num_rows = ScanFileContents(columns, column_batch_size, impl_->reader());
+    return Status::OK();
+  } catch (const ::parquet::ParquetException& e) {
+    return Status::IOError(e.what());
+  }
+}
+
 const ParquetFileReader* FileReader::parquet_reader() const {
   return impl_->parquet_reader();
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bc46b143/src/parquet/arrow/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.h b/src/parquet/arrow/reader.h
index f9688fb..ce82375 100644
--- a/src/parquet/arrow/reader.h
+++ b/src/parquet/arrow/reader.h
@@ -146,6 +146,10 @@ class PARQUET_EXPORT FileReader {
 
   ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);
 
+  /// \brief Scan file contents with one thread, return number of rows
+  ::arrow::Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
+                               int64_t* num_rows);
+
   int num_row_groups() const;
 
   const ParquetFileReader* parquet_reader() const;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bc46b143/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index c27fa4d..26876fc 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -153,13 +153,12 @@ int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_si
     }
   }
 
-  std::vector<int64_t> total_rows(num_columns);
+  std::vector<int64_t> total_rows(num_columns, 0);
 
   for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
     auto group_reader = reader->RowGroup(r);
     int col = 0;
     for (auto i : columns) {
-      total_rows[col] = 0;
       std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
       size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
       std::vector<uint8_t> values(column_batch_size * value_byte_size);


Mime
View raw message