parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-599: Better size estimation for levels
Date Mon, 09 May 2016 00:13:01 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 8a6eca9d1 -> 35c8eb54a


PARQUET-599: Better size estimation for levels

This is still not an optimal size estimate, but at least we will always allocate the required amount.

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #96 from xhochy/parquet-599 and squashes the following commits:

e8044b5 [Uwe L. Korn] PARQUET-599: Better size estimation for levels


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/35c8eb54
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/35c8eb54
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/35c8eb54

Branch: refs/heads/master
Commit: 35c8eb54aadcc18057b25db0cc6fd22239dee908
Parents: 8a6eca9
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sun May 8 17:12:54 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Sun May 8 17:12:54 2016 -0700

----------------------------------------------------------------------
 src/parquet/column/levels.h  | 22 ++++++++++++++++++++++
 src/parquet/column/writer.cc |  4 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35c8eb54/src/parquet/column/levels.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/levels.h b/src/parquet/column/levels.h
index fd84ec9..20261df 100644
--- a/src/parquet/column/levels.h
+++ b/src/parquet/column/levels.h
@@ -31,6 +31,28 @@ class LevelEncoder {
  public:
   LevelEncoder() {}
 
+  static int MaxBufferSize(
+      Encoding::type encoding, int16_t max_level, int num_buffered_values) {
+    int bit_width = BitUtil::Log2(max_level + 1);
+    int num_bytes = 0;
+    switch (encoding) {
+      case Encoding::RLE: {
+        // TODO: Due to the way we currently check if the buffer is full enough,
+        // we need to have MinBufferSize as head room.
+        num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+                    RleEncoder::MinBufferSize(bit_width);
+        break;
+      }
+      case Encoding::BIT_PACKED: {
+        num_bytes = BitUtil::Ceil(num_buffered_values * bit_width, 8);
+        break;
+      }
+      default:
+        throw ParquetException("Unknown encoding type for levels.");
+    }
+    return num_bytes;
+  }
+
   // Initialize the LevelEncoder.
   void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
       uint8_t* data, int data_size) {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35c8eb54/src/parquet/column/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc
index 0472adb..a23610c 100644
--- a/src/parquet/column/writer.cc
+++ b/src/parquet/column/writer.cc
@@ -62,7 +62,9 @@ void ColumnWriter::WriteRepetitionLevels(int64_t num_levels, int16_t* levels) {
 std::shared_ptr<Buffer> ColumnWriter::RleEncodeLevels(
     const std::shared_ptr<Buffer>& buffer, int16_t max_level) {
   // TODO: This only works with due to some RLE specifics
-  int64_t rle_size = 2 * num_buffered_values_ + sizeof(uint32_t);
+  int64_t rle_size =
+      LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, num_buffered_values_) +
+      sizeof(uint32_t);
   auto buffer_rle = std::make_shared<OwnedMutableBuffer>(rle_size, allocator_);
   level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_,
       buffer_rle->mutable_data() + sizeof(uint32_t),


Mime
View raw message