parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [parquet-cpp] branch master updated: PARQUET-1348: Add ability to write FileMetaData in arrow FileWriter
Date Sat, 28 Jul 2018 15:21:29 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new b4023c2  PARQUET-1348: Add ability to write FileMetaData in arrow FileWriter
b4023c2 is described below

commit b4023c21958c8d654ed304a1a0eb628317a2d170
Author: Robert Gruener <robbieg@uber.com>
AuthorDate: Sat Jul 28 17:21:11 2018 +0200

    PARQUET-1348: Add ability to write FileMetaData in arrow FileWriter
    
    I used static functions since there was no better way I could see
    
    Author: Robert Gruener <robbieg@uber.com>
    
    Closes #481 from rgruener/write-metadata and squashes the following commits:
    
    bdd5ec1 [Robert Gruener] PARQUET-1348: Add ability to write FileMetaData in arrow FileWriter
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 32 +++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   | 13 ++++++++++
 src/parquet/arrow/writer.h                    |  8 ++++++
 src/parquet/file_writer.cc                    | 37 +++++++++++++++++++--------
 src/parquet/file_writer.h                     |  8 ++++++
 5 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 1c2f322..02b8d52 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -851,6 +851,38 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) {
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values));
 }
 
+TYPED_TEST(TestParquetIO, FileMetaDataWrite) {
+  std::shared_ptr<Array> values;
+  ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
+  std::shared_ptr<Table> table = MakeSimpleTable(values, false);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                values->length(), default_writer_properties()));
+
+  std::unique_ptr<FileReader> reader;
+  ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+  const std::shared_ptr<FileMetaData> fileMetaData = reader->parquet_reader()->metadata();
+  ASSERT_EQ(1, fileMetaData->num_columns());
+  ASSERT_EQ(100, fileMetaData->num_rows());
+
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+
+  std::unique_ptr<FileMetaData> uniqueFileMetaData(fileMetaData.get());
+
+  ASSERT_OK_NO_THROW(FileWriter::WriteMetaData(uniqueFileMetaData, this->sink_));
+
+  ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+  const std::shared_ptr<FileMetaData> fileMetaDataWritten =
+          reader->parquet_reader()->metadata();
+  ASSERT_EQ(fileMetaData->size(), fileMetaDataWritten->size());
+  ASSERT_EQ(fileMetaData->num_row_groups(), fileMetaDataWritten->num_row_groups());
+  ASSERT_EQ(fileMetaData->num_rows(), fileMetaDataWritten->num_rows());
+  ASSERT_EQ(fileMetaData->num_columns(), fileMetaDataWritten->num_columns());
+  ASSERT_EQ(fileMetaData->RowGroup(0)->num_rows(),
+            fileMetaDataWritten->RowGroup(0)->num_rows());
+  uniqueFileMetaData.release();
+}
+
 using TestInt96ParquetIO = TestParquetIO<::arrow::TimestampType>;
 
 TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index f3ddda9..d1697c3 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -1092,6 +1092,19 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool
   return Open(schema, pool, wrapper, properties, arrow_properties, writer);
 }
 
+Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>& fileMetaData,
+                                 const std::shared_ptr<OutputStream>& sink) {
+  ParquetFileWriter::WriteMetaData(sink, fileMetaData);
+  return Status::OK();
+}
+
+Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>& fileMetaData,
+                                 const std::shared_ptr<::arrow::io::OutputStream>& sink) {
+  auto wrapper = std::make_shared<ArrowOutputStream>(sink);
+  return WriteMetaData(fileMetaData, wrapper);
+}
+
+
 namespace {}  // namespace
 
 Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
diff --git a/src/parquet/arrow/writer.h b/src/parquet/arrow/writer.h
index 06008d2..d62d3b0 100644
--- a/src/parquet/arrow/writer.h
+++ b/src/parquet/arrow/writer.h
@@ -132,6 +132,14 @@ class PARQUET_EXPORT FileWriter {
       const std::shared_ptr<ArrowWriterProperties>& arrow_properties,
       std::unique_ptr<FileWriter>* writer);
 
+  static ::arrow::Status WriteMetaData(
+      const std::unique_ptr<FileMetaData>& fileMetaData,
+      const std::shared_ptr<OutputStream>& sink);
+
+  static ::arrow::Status WriteMetaData(
+      const std::unique_ptr<FileMetaData>& fileMetaData,
+      const std::shared_ptr<::arrow::io::OutputStream>& sink);
+
   /// \brief Write a Table to Parquet.
   ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size);
 
diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc
index 1e4a09e..cc34fd0 100644
--- a/src/parquet/file_writer.cc
+++ b/src/parquet/file_writer.cc
@@ -160,6 +160,20 @@ class FileSerializer : public ParquetFileWriter::Contents {
     return result;
   }
 
+  static void WriteMetaData(
+      const std::shared_ptr<OutputStream>& sink,
+      const std::unique_ptr<FileMetaData>& fileMetaData) {
+    // Write MetaData
+    uint32_t metadata_len = static_cast<uint32_t>(sink->Tell());
+
+    fileMetaData->WriteTo(sink.get());
+    metadata_len = static_cast<uint32_t>(sink->Tell()) - metadata_len;
+
+    // Write Footer
+    sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
+    sink->Write(PARQUET_MAGIC, 4);
+  }
+
   void Close() override {
     if (is_open_) {
       if (row_group_writer_) {
@@ -234,17 +248,8 @@ class FileSerializer : public ParquetFileWriter::Contents {
   }
 
   void WriteMetaData() {
-    // Write MetaData
-    uint32_t metadata_len = static_cast<uint32_t>(sink_->Tell());
-
-    // Get a FileMetaData
     auto metadata = metadata_->Finish();
-    metadata->WriteTo(sink_.get());
-    metadata_len = static_cast<uint32_t>(sink_->Tell()) - metadata_len;
-
-    // Write Footer
-    sink_->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
-    sink_->Write(PARQUET_MAGIC, 4);
+    WriteMetaData(sink_, metadata);
   }
 };
 
@@ -280,6 +285,18 @@ std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
   return result;
 }
 
+void ParquetFileWriter::WriteMetaData(
+        const std::shared_ptr<::arrow::io::OutputStream> &sink,
+        const std::unique_ptr<FileMetaData> &fileMetaData) {
+    WriteMetaData(std::make_shared<ArrowOutputStream>(sink), fileMetaData);
+}
+
+void ParquetFileWriter::WriteMetaData(
+        const std::shared_ptr<OutputStream> &sink,
+        const std::unique_ptr<FileMetaData> &fileMetaData) {
+  FileSerializer::WriteMetaData(sink, fileMetaData);
+}
+
 const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
 
 const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
diff --git a/src/parquet/file_writer.h b/src/parquet/file_writer.h
index 9c28531..e0d1dae 100644
--- a/src/parquet/file_writer.h
+++ b/src/parquet/file_writer.h
@@ -133,6 +133,14 @@ class PARQUET_EXPORT ParquetFileWriter {
       const std::shared_ptr<WriterProperties>& properties = default_writer_properties(),
       const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = nullptr);
 
+  static void WriteMetaData(
+          const std::shared_ptr<::arrow::io::OutputStream> &sink,
+          const std::unique_ptr<FileMetaData> &fileMetaData);
+
+  static void WriteMetaData(
+          const std::shared_ptr<OutputStream> &sink,
+          const std::unique_ptr<FileMetaData> &fileMetaData);
+
   void Open(std::unique_ptr<Contents> contents);
   void Close();
 


Mime
View raw message