Repository: parquet-cpp
Updated Branches:
refs/heads/master 8a6eca9d1 -> 35c8eb54a
PARQUET-599: Better size estimation for levels
This is still not an optimal size estimate, but at least we will always have the required amount.
Author: Uwe L. Korn <uwelk@xhochy.com>
Closes #96 from xhochy/parquet-599 and squashes the following commits:
e8044b5 [Uwe L. Korn] PARQUET-599: Better size estimation for levels
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/35c8eb54
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/35c8eb54
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/35c8eb54
Branch: refs/heads/master
Commit: 35c8eb54aadcc18057b25db0cc6fd22239dee908
Parents: 8a6eca9
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sun May 8 17:12:54 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Sun May 8 17:12:54 2016 -0700
----------------------------------------------------------------------
src/parquet/column/levels.h | 22 ++++++++++++++++++++++
src/parquet/column/writer.cc | 4 +++-
2 files changed, 25 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35c8eb54/src/parquet/column/levels.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/levels.h b/src/parquet/column/levels.h
index fd84ec9..20261df 100644
--- a/src/parquet/column/levels.h
+++ b/src/parquet/column/levels.h
@@ -31,6 +31,28 @@ class LevelEncoder {
public:
LevelEncoder() {}
+ static int MaxBufferSize(  // Returns the byte count to reserve for encoding num_buffered_values levels.
+ Encoding::type encoding, int16_t max_level, int num_buffered_values) {
+ int bit_width = BitUtil::Log2(max_level + 1);  // presumably bits per level value in [0, max_level] -- confirm BitUtil::Log2 rounding
+ int num_bytes = 0;
+ switch (encoding) {
+ case Encoding::RLE: {
+ // TODO: The way we currently check whether the buffer is full requires
+ // MinBufferSize of spare room, so it is added on top as head room.
+ num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+ RleEncoder::MinBufferSize(bit_width);
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ num_bytes = BitUtil::Ceil(num_buffered_values * bit_width, 8);  // total bits converted to bytes; presumably rounded up -- confirm BitUtil::Ceil
+ break;
+ }
+ default:  // any encoding other than RLE or BIT_PACKED is rejected
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return num_bytes;
+ }
+
// Initialize the LevelEncoder.
void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
uint8_t* data, int data_size) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35c8eb54/src/parquet/column/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc
index 0472adb..a23610c 100644
--- a/src/parquet/column/writer.cc
+++ b/src/parquet/column/writer.cc
@@ -62,7 +62,9 @@ void ColumnWriter::WriteRepetitionLevels(int64_t num_levels, int16_t* levels)
{
std::shared_ptr<Buffer> ColumnWriter::RleEncodeLevels(
const std::shared_ptr<Buffer>& buffer, int16_t max_level) {
// TODO: This only works with due to some RLE specifics
- int64_t rle_size = 2 * num_buffered_values_ + sizeof(uint32_t);
+ int64_t rle_size =
+ LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, num_buffered_values_) +
+ sizeof(uint32_t);
auto buffer_rle = std::make_shared<OwnedMutableBuffer>(rle_size, allocator_);
level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_,
buffer_rle->mutable_data() + sizeof(uint32_t),
|