From commits-return-1366-archive-asf-public=cust-asf.ponee.io@parquet.apache.org Wed Aug 1 21:14:28 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 90A5B180634 for ; Wed, 1 Aug 2018 21:14:27 +0200 (CEST) Received: (qmail 87506 invoked by uid 500); 1 Aug 2018 19:14:26 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 87497 invoked by uid 99); 1 Aug 2018 19:14:26 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Aug 2018 19:14:26 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 1A33D82220; Wed, 1 Aug 2018 19:14:26 +0000 (UTC) Date: Wed, 01 Aug 2018 19:14:26 +0000 To: "commits@parquet.apache.org" Subject: [parquet-cpp] branch master updated: PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <153315086600.22270.14340893160599169599@gitbox.apache.org> From: wesm@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: parquet-cpp X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 646e2258172112036e3c4c2e6541b0f86b5fb35f X-Git-Newrev: a0d1669cf67b055cd7b724dea04886a0ded53c8f X-Git-Rev: a0d1669cf67b055cd7b724dea04886a0ded53c8f X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git The following commit(s) were added to refs/heads/master by this push: new a0d1669 PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs a0d1669 is described below commit a0d1669cf67b055cd7b724dea04886a0ded53c8f Author: Antoine Pitrou AuthorDate: Wed Aug 1 15:14:15 2018 -0400 PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs This is required before we can remove some duplicate or little-used APIs in ARROW-2950. Author: Antoine Pitrou Closes #483 from pitrou/PARQUET-1366-arrow-bit-util and squashes the following commits: 1eb6ef0 [Antoine Pitrou] Avoid using FirstTimeBitmapWriter for now (this Arrow API is too recent) 57aa82c [Antoine Pitrou] Fix line size 86c4ca5 [Antoine Pitrou] PARQUET-1366: [C++] Streamline use of Arrow bit-util.h --- src/parquet/arrow/test-util.h | 2 +- src/parquet/column_reader.cc | 2 +- src/parquet/column_writer-test.cc | 2 +- src/parquet/column_writer.cc | 5 +++-- src/parquet/encoding-internal.h | 7 ++++++- src/parquet/encoding-test.cc | 4 ++-- src/parquet/statistics-test.cc | 4 ++-- 7 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index bfc78c8..2babacb 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -368,7 +368,7 @@ Status MakeListArray(const std::shared_ptr& values, int64_t size, int32_t* offsets_ptr = reinterpret_cast(offsets->mutable_data()); auto null_bitmap = AllocateBuffer(); - int64_t bitmap_size = ::arrow::BitUtil::CeilByte(size) / 8; + int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size); RETURN_NOT_OK(null_bitmap->Resize(bitmap_size)); uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); memset(null_bitmap_ptr, 0, bitmap_size); diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index bc3ee8a..28d0dcb 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -60,7 +60,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, } case Encoding::BIT_PACKED: { num_bytes = - static_cast(BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + static_cast(BitUtil::BytesForBits(num_buffered_values * bit_width_)); if (!bit_packed_decoder_) { bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes)); } else { diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index aac582a..6c0794a 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -137,7 +137,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { bool enable_dictionary, bool enable_statistics, int64_t num_rows) { std::vector valid_bits( - BitUtil::RoundUpNumBytes(static_cast(this->values_.size())) + 1, 255); + BitUtil::BytesForBits(static_cast(this->values_.size())) + 1, 255); ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); std::shared_ptr> writer = diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 7d47d3f..48fba55 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -50,7 +50,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t max_level, } case Encoding::BIT_PACKED: { int num_bytes = - static_cast(BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + static_cast(BitUtil::BytesForBits(num_buffered_values * bit_width_)); bit_packed_encoder_.reset(new BitWriter(data, num_bytes)); break; } @@ -72,7 +72,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level, break; } case Encoding::BIT_PACKED: { - num_bytes = static_cast(BitUtil::Ceil(num_buffered_values * bit_width, 8)); + num_bytes = + static_cast(BitUtil::BytesForBits(num_buffered_values * bit_width)); break; } default: diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h index 98f9e4a..2dfb9ff 100644 --- a/src/parquet/encoding-internal.h +++ b/src/parquet/encoding-internal.h @@ -151,12 +151,17 @@ class PlainDecoder : public Decoder { int Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); bool val; + ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); for (int i = 0; i < max_values; ++i) { if (!bit_reader_.GetValue(1, &val)) { ParquetException::EofException(); } - BitUtil::SetArrayBit(buffer, i, val); + if (val) { + bit_writer.Set(); + } + bit_writer.Next(); } + bit_writer.Finish(); num_values_ -= max_values; return max_values; } diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc index 60285ab..50e1394 100644 --- a/src/parquet/encoding-test.cc +++ b/src/parquet/encoding-test.cc @@ -43,7 +43,7 @@ namespace test { TEST(VectorBooleanTest, TestEncodeDecode) { // PARQUET-454 int nvalues = 10000; - int nbytes = static_cast(BitUtil::Ceil(nvalues, 8)); + int nbytes = static_cast(BitUtil::BytesForBits(nvalues)); // seed the prng so failure is deterministic vector draws = flip_coins_seed(nvalues, 0.5, 0); @@ -252,7 +252,7 @@ class TestDictionaryEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() { - std::vector valid_bits(BitUtil::RoundUpNumBytes(num_values_) + 1, 255); + std::vector valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255); DictEncoder encoder(descr_.get(), &pool_); ASSERT_NO_THROW(encoder.Put(draws_, num_values_)); diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 943d5cc..d2ecede 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -72,7 +72,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest { TypedStats statistics3(this->schema_.Column(0)); std::vector valid_bits( - BitUtil::RoundUpNumBytes(static_cast(this->values_.size())) + 1, 255); + BitUtil::BytesForBits(static_cast(this->values_.size())) + 1, 255); statistics3.UpdateSpaced(this->values_ptr_, valid_bits.data(), 0, this->values_.size(), 0); std::string encoded_min_spaced = statistics3.EncodeMin(); @@ -722,7 +722,7 @@ TEST(TestStatisticsFloatNaN, NaNValuesSpaced) { for (int i = 0; i < NUM_VALUES; i++) { nan_values[i] = std::nanf(""); } - std::vector valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 255); + std::vector valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255); // Test values TypedRowGroupStatistics nan_stats(&descr);