Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 620AD200B9F for ; Tue, 11 Oct 2016 17:18:11 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 609C9160AE6; Tue, 11 Oct 2016 15:18:11 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 09EF0160AD2 for ; Tue, 11 Oct 2016 17:18:09 +0200 (CEST) Received: (qmail 44680 invoked by uid 500); 11 Oct 2016 15:18:09 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 44670 invoked by uid 99); 11 Oct 2016 15:18:09 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 11 Oct 2016 15:18:09 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id F31BADFDC4; Tue, 11 Oct 2016 15:18:08 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@parquet.apache.org Message-Id: <2588b1ae9bd54dbca8b2f4a7c70e5883@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: parquet-cpp git commit: PARQUET-747: Better hide TypedRowGroupStatistics in public API Date: Tue, 11 Oct 2016 15:18:08 +0000 (UTC) archived-at: Tue, 11 Oct 2016 15:18:11 -0000 Repository: parquet-cpp Updated Branches: refs/heads/master a26ed0f17 -> 78ed6e8ce PARQUET-747: Better hide TypedRowGroupStatistics in public API EDIT: It appears the linker issue is coming from the use of `TypedRowGroupStatistics::Update` in `WriteBatch`, and Update hasn't been exported. There's some gcc quirks around this (see https://codereview.chromium.org/643703003/), so it's easier to hide the WriteBatch implementation so that `libparquet_arrow` doesn't look for non-exported symbols during linking (since `WriteBatch` was being inlined). Template subclasses don't behave in the same way with visibility attributes. See also PARQUET-659. Observed in build failure https://circleci.com/gh/conda-forge/staged-recipes/10100 Author: Wes McKinney Closes #178 from wesm/PARQUET-747 and squashes the following commits: 8e91f86 [Wes McKinney] Do not expose WriteBatch implementation in public headers 206f4e1 [Wes McKinney] clang format ed5132d [Wes McKinney] Export single instantiations of TypedRowGroupStatistics with extern templates Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/78ed6e8c Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/78ed6e8c Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/78ed6e8c Branch: refs/heads/master Commit: 78ed6e8cebfabd28567a6feaf82d736bce5316d2 Parents: a26ed0f Author: Wes McKinney Authored: Tue Oct 11 11:18:01 2016 -0400 Committer: Wes McKinney Committed: Tue Oct 11 11:18:01 2016 -0400 ---------------------------------------------------------------------- src/parquet/arrow/arrow-reader-writer-test.cc | 7 +- src/parquet/column/column-writer-test.cc | 3 +- src/parquet/column/scanner-test.cc | 3 +- src/parquet/column/statistics.cc | 26 +------ src/parquet/column/statistics.h | 38 +++++++++- src/parquet/column/writer.cc | 80 +++++++++++++++++++++ src/parquet/column/writer.h | 82 +--------------------- src/parquet/encodings/encoding-test.cc | 3 +- src/parquet/file/file-serialize-test.cc | 3 +- src/parquet/thrift/util.h | 6 +- src/parquet/util/comparison.h | 2 +- src/parquet/util/test-common.h | 3 +- 12 files changed, 137 insertions(+), 119 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/arrow/arrow-reader-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 11ebee0..b1f1c52 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -258,7 +258,8 @@ class TestParquetIO : public ::testing::Test { typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type, ::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::TimestampType, ::arrow::FloatType, ::arrow::DoubleType, - ::arrow::StringType> TestTypes; + ::arrow::StringType> + TestTypes; TYPED_TEST_CASE(TestParquetIO, TestTypes); @@ -476,8 +477,8 @@ class TestPrimitiveParquetIO : public TestParquetIO { typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type, ::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::UInt32Type, ::arrow::Int32Type, - ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::FloatType, - ::arrow::DoubleType> PrimitiveTestTypes; + ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::FloatType, ::arrow::DoubleType> + PrimitiveTestTypes; TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/column-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc index 9f04c06..745efe7 100644 --- a/src/parquet/column/column-writer-test.cc +++ b/src/parquet/column/column-writer-test.cc @@ -214,7 +214,8 @@ void TestPrimitiveWriter::ReadColumnFully(Compression::type compressio } typedef ::testing::Types TestTypes; + BooleanType, ByteArrayType, FLBAType> + TestTypes; TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/scanner-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc index 3ac07dc..8eee191 100644 --- a/src/parquet/column/scanner-test.cc +++ b/src/parquet/column/scanner-test.cc @@ -146,7 +146,8 @@ static int num_pages = 20; static int batch_size = 32; typedef ::testing::Types TestTypes; + ByteArrayType> + TestTypes; using TestBooleanFlatScanner = TestFlatScanner; using TestFLBAFlatScanner = TestFlatScanner; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/statistics.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/statistics.cc b/src/parquet/column/statistics.cc index 65d7df5..d8f8785 100644 --- a/src/parquet/column/statistics.cc +++ b/src/parquet/column/statistics.cc @@ -20,10 +20,10 @@ #include "parquet/column/statistics.h" #include "parquet/encodings/plain-encoding.h" +#include "parquet/exception.h" #include "parquet/util/buffer.h" #include "parquet/util/comparison.h" #include "parquet/util/output.h" -#include "parquet/exception.h" namespace parquet { @@ -79,30 +79,6 @@ void TypedRowGroupStatistics::Reset() { } template -void TypedRowGroupStatistics::Copy(const T& src, T* dst, OwnedMutableBuffer&) { - *dst = src; -} - -template <> -void TypedRowGroupStatistics::Copy( - const FLBA& src, FLBA* dst, OwnedMutableBuffer& buffer) { - if (dst->ptr == src.ptr) return; - uint32_t len = descr_->type_length(); - buffer.Resize(len); - std::memcpy(&buffer[0], src.ptr, len); - *dst = FLBA(buffer.data()); -} - -template <> -void TypedRowGroupStatistics::Copy( - const ByteArray& src, ByteArray* dst, OwnedMutableBuffer& buffer) { - if (dst->ptr == src.ptr) return; - buffer.Resize(src.len); - std::memcpy(&buffer[0], src.ptr, src.len); - *dst = ByteArray(src.len, buffer.data()); -} - -template void TypedRowGroupStatistics::Update( const T* values, int64_t num_not_null, int64_t num_null) { DCHECK(num_not_null >= 0); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/statistics.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/statistics.h b/src/parquet/column/statistics.h index 02e5abd..dc576c8 100644 --- a/src/parquet/column/statistics.h +++ b/src/parquet/column/statistics.h @@ -22,8 +22,8 @@ #include #include -#include "parquet/types.h" #include "parquet/schema/descriptor.h" +#include "parquet/types.h" #include "parquet/util/buffer.h" #include "parquet/util/mem-allocator.h" #include "parquet/util/visibility.h" @@ -130,7 +130,7 @@ class PARQUET_EXPORT RowGroupStatistics }; template -class PARQUET_EXPORT TypedRowGroupStatistics : public RowGroupStatistics { +class TypedRowGroupStatistics : public RowGroupStatistics { public: using T = typename DType::c_type; @@ -170,6 +170,31 @@ class PARQUET_EXPORT TypedRowGroupStatistics : public RowGroupStatistics { OwnedMutableBuffer min_buffer_, max_buffer_; }; +template +inline void TypedRowGroupStatistics::Copy( + const T& src, T* dst, OwnedMutableBuffer&) { + *dst = src; +} + +template <> +inline void TypedRowGroupStatistics::Copy( + const FLBA& src, FLBA* dst, OwnedMutableBuffer& buffer) { + if (dst->ptr == src.ptr) return; + uint32_t len = descr_->type_length(); + buffer.Resize(len); + std::memcpy(&buffer[0], src.ptr, len); + *dst = FLBA(buffer.data()); +} + +template <> +inline void TypedRowGroupStatistics::Copy( + const ByteArray& src, ByteArray* dst, OwnedMutableBuffer& buffer) { + if (dst->ptr == src.ptr) return; + buffer.Resize(src.len); + std::memcpy(&buffer[0], src.ptr, src.len); + *dst = ByteArray(src.len, buffer.data()); +} + using BoolStatistics = TypedRowGroupStatistics; using Int32Statistics = TypedRowGroupStatistics; using Int64Statistics = TypedRowGroupStatistics; @@ -179,6 +204,15 @@ using DoubleStatistics = TypedRowGroupStatistics; using ByteArrayStatistics = TypedRowGroupStatistics; using FLBAStatistics = TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; +extern template class TypedRowGroupStatistics; + } // namespace parquet #endif // PARQUET_COLUMN_STATISTICS_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc index 5c10ab9..b917945 100644 --- a/src/parquet/column/writer.cc +++ b/src/parquet/column/writer.cc @@ -301,6 +301,86 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met // ---------------------------------------------------------------------- // Instantiate templated classes +template +inline int64_t TypedColumnWriter::WriteMiniBatch(int64_t num_values, + const int16_t* def_levels, const int16_t* rep_levels, const T* values) { + int64_t values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + for (int64_t i = 0; i < num_values; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; } + } + + WriteDefinitionLevels(num_values, def_levels); + } else { + // Required field, write all values + values_to_write = num_values; + } + + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_values; ++i) { + if (rep_levels[i] == 0) { num_rows_++; } + } + + WriteRepetitionLevels(num_values, rep_levels); + } else { + // Each value is exactly one row + num_rows_ += num_values; + } + + if (num_rows_ > expected_rows_) { + throw ParquetException("More rows were written in the column chunk than expected"); + } + + WriteValues(values_to_write, values); + + if (page_statistics_ != nullptr) { + page_statistics_->Update(values, values_to_write, num_values - values_to_write); + } + + num_buffered_values_ += num_values; + num_buffered_encoded_values_ += values_to_write; + + if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { + AddDataPage(); + } + if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); } + + return values_to_write; +} + +template +void TypedColumnWriter::WriteBatch(int64_t num_values, + const int16_t* def_levels, const int16_t* rep_levels, const T* values) { + // We check for DataPage limits only after we have inserted the values. If a user + // writes a large number of values, the DataPage size can be much above the limit. + // The purpose of this chunking is to bound this. Even if a user writes large number + // of values, the chunking will ensure the AddDataPage() is called at a reasonable + // pagesize limit + int64_t write_batch_size = properties_->write_batch_size(); + int num_batches = num_values / write_batch_size; + int64_t num_remaining = num_values % write_batch_size; + int64_t value_offset = 0; + for (int round = 0; round < num_batches; round++) { + int64_t offset = round * write_batch_size; + int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], + &rep_levels[offset], &values[value_offset]); + value_offset += num_values; + } + // Write the remaining values + int64_t offset = num_batches * write_batch_size; + WriteMiniBatch( + num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]); +} + +template +void TypedColumnWriter::WriteValues(int64_t num_values, const T* values) { + current_encoder_->Put(values, num_values); +} + template class TypedColumnWriter; template class TypedColumnWriter; template class TypedColumnWriter; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/writer.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h index ab01818..67a29bc 100644 --- a/src/parquet/column/writer.h +++ b/src/parquet/column/writer.h @@ -24,8 +24,8 @@ #include "parquet/column/page.h" #include "parquet/column/properties.h" #include "parquet/column/statistics.h" -#include "parquet/file/metadata.h" #include "parquet/encodings/encoder.h" +#include "parquet/file/metadata.h" #include "parquet/schema/descriptor.h" #include "parquet/types.h" #include "parquet/util/mem-allocator.h" @@ -186,86 +186,6 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter { std::unique_ptr chunk_statistics_; }; -template -inline int64_t TypedColumnWriter::WriteMiniBatch(int64_t num_values, - const int16_t* def_levels, const int16_t* rep_levels, const T* values) { - int64_t values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - for (int64_t i = 0; i < num_values; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; } - } - - WriteDefinitionLevels(num_values, def_levels); - } else { - // Required field, write all values - values_to_write = num_values; - } - - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { - if (rep_levels[i] == 0) { num_rows_++; } - } - - WriteRepetitionLevels(num_values, rep_levels); - } else { - // Each value is exactly one row - num_rows_ += num_values; - } - - if (num_rows_ > expected_rows_) { - throw ParquetException("More rows were written in the column chunk than expected"); - } - - WriteValues(values_to_write, values); - - if (page_statistics_ != nullptr) { - page_statistics_->Update(values, values_to_write, num_values - values_to_write); - } - - num_buffered_values_ += num_values; - num_buffered_encoded_values_ += values_to_write; - - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); } - - return values_to_write; -} - -template -inline void TypedColumnWriter::WriteBatch(int64_t num_values, - const int16_t* def_levels, const int16_t* rep_levels, const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = num_values / write_batch_size; - int64_t num_remaining = num_values % write_batch_size; - int64_t value_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], - &rep_levels[offset], &values[value_offset]); - value_offset += num_values; - } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatch( - num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]); -} - -template -void TypedColumnWriter::WriteValues(int64_t num_values, const T* values) { - current_encoder_->Put(values, num_values); -} - typedef TypedColumnWriter BoolWriter; typedef TypedColumnWriter Int32Writer; typedef TypedColumnWriter Int64Writer; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/encodings/encoding-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc index 6af7c9d..daa25cb 100644 --- a/src/parquet/encodings/encoding-test.cc +++ b/src/parquet/encodings/encoding-test.cc @@ -238,7 +238,8 @@ TYPED_TEST(TestPlainEncoding, BasicRoundTrip) { // Dictionary encoding tests typedef ::testing::Types DictEncodedTypes; + ByteArrayType, FLBAType> + DictEncodedTypes; template class TestDictionaryEncoding : public TestEncodingBase { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/file/file-serialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc index 5980e55..42a73c9 100644 --- a/src/parquet/file/file-serialize-test.cc +++ b/src/parquet/file/file-serialize-test.cc @@ -106,7 +106,8 @@ class TestSerialize : public PrimitiveTypedTest { }; typedef ::testing::Types TestTypes; + BooleanType, ByteArrayType, FLBAType> + TestTypes; TYPED_TEST_CASE(TestSerialize, TestTypes); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/thrift/util.h ---------------------------------------------------------------------- diff --git a/src/parquet/thrift/util.h b/src/parquet/thrift/util.h index f629689..a340c6c 100644 --- a/src/parquet/thrift/util.h +++ b/src/parquet/thrift/util.h @@ -82,7 +82,8 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali boost::shared_ptr tmem_transport( new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> tproto_factory; + apache::thrift::transport::TMemoryBuffer> + tproto_factory; boost::shared_ptr tproto = tproto_factory.getProtocol(tmem_transport); try { @@ -104,7 +105,8 @@ inline void SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { boost::shared_ptr mem_buffer( new apache::thrift::transport::TMemoryBuffer(len)); apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> tproto_factory; + apache::thrift::transport::TMemoryBuffer> + tproto_factory; boost::shared_ptr tproto = tproto_factory.getProtocol(mem_buffer); try { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/util/comparison.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index 9d44e7e..5ca7520 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -20,8 +20,8 @@ #include -#include "parquet/types.h" #include "parquet/schema/descriptor.h" +#include "parquet/types.h" namespace parquet { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/util/test-common.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h index edadb53..2327aeb 100644 --- a/src/parquet/util/test-common.h +++ b/src/parquet/util/test-common.h @@ -32,7 +32,8 @@ namespace parquet { namespace test { typedef ::testing::Types ParquetTypes; + DoubleType, ByteArrayType, FLBAType> + ParquetTypes; template static inline void assert_vector_equal(const vector& left, const vector& right) {