parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject parquet-cpp git commit: PARQUET-503: Reenable parquet 2.0 encoding implementations.
Date Tue, 02 Feb 2016 22:50:05 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 08088af76 -> ee83fad67


PARQUET-503: Reenable parquet 2.0 encoding implementations.

Author: Nong Li <nongli@gmail.com>

Closes #35 from nongli/parquet-503 and squashes the following commits:

cb2a4e1 [Nong Li] PARQUET-503: Reenable parquet 2.0 encoding implementations.


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ee83fad6
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ee83fad6
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ee83fad6

Branch: refs/heads/master
Commit: ee83fad67d07977b6493dc1e7b0dde63d58b9bf8
Parents: 08088af
Author: Nong Li <nongli@gmail.com>
Authored: Tue Feb 2 14:50:00 2016 -0800
Committer: Julien Le Dem <julien@dremio.com>
Committed: Tue Feb 2 14:50:00 2016 -0800

----------------------------------------------------------------------
 .gitignore                                      |  1 +
 example/CMakeLists.txt                          |  5 ++--
 src/parquet/encodings/delta-bit-pack-encoding.h | 10 ++++----
 src/parquet/encodings/encodings.h               |  8 +++---
 src/parquet/util/bit-stream-utils.h             | 12 +++++----
 src/parquet/util/bit-stream-utils.inline.h      | 26 +++++++++-----------
 src/parquet/util/bit-util-test.cc               | 19 ++++++++++++++
 7 files changed, 49 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 172a03a..f90103a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 bin
 build
 generated
+Testing/
 
 CMakeCache.txt
 CMakeFiles

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 730b408..bd9e66c 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -20,9 +20,8 @@ SET(LINK_LIBS
   snappystatic
   thriftstatic)
 
-# Disabled because decoding code has changed
-# add_executable(decode_benchmark decode_benchmark.cc)
-# target_link_libraries(decode_benchmark ${LINK_LIBS})
+add_executable(decode_benchmark decode_benchmark.cc)
+target_link_libraries(decode_benchmark ${LINK_LIBS})
 
 add_executable(parquet_reader parquet_reader.cc)
 target_link_libraries(parquet_reader ${LINK_LIBS})

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/delta-bit-pack-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/delta-bit-pack-encoding.h b/src/parquet/encodings/delta-bit-pack-encoding.h
index a0833b5..858fcec 100644
--- a/src/parquet/encodings/delta-bit-pack-encoding.h
+++ b/src/parquet/encodings/delta-bit-pack-encoding.h
@@ -54,7 +54,7 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
   using Decoder<TYPE>::num_values_;
 
   void InitBlock() {
-    uint64_t block_size;
+    int32_t block_size;
     if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
     if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
     if (!decoder_.GetVlqInt(&values_current_block_)) {
@@ -104,17 +104,17 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
   }
 
   BitReader decoder_;
-  uint64_t values_current_block_;
-  uint64_t num_mini_blocks_;
+  int32_t values_current_block_;
+  int32_t num_mini_blocks_;
   uint64_t values_per_mini_block_;
   uint64_t values_current_mini_block_;
 
-  int64_t min_delta_;
+  int32_t min_delta_;
   int mini_block_idx_;
   std::vector<uint8_t> delta_bit_widths_;
   int delta_bit_width_;
 
-  int64_t last_value_;
+  int32_t last_value_;
 };
 } // namespace parquet_cpp
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/encodings.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h
index 4fb3d9a..0d9202e 100644
--- a/src/parquet/encodings/encodings.h
+++ b/src/parquet/encodings/encodings.h
@@ -105,10 +105,8 @@ class Encoder {
 
 #include "parquet/encodings/plain-encoding.h"
 #include "parquet/encodings/dictionary-encoding.h"
-
-// The encoding tools changed and these are missing the ZigZag functions
-// #include "parquet/encodings/delta-bit-pack-encoding.h"
-// #include "parquet/encodings/delta-length-byte-array-encoding.h"
-// #include "parquet/encodings/delta-byte-array-encoding.h"
+#include "parquet/encodings/delta-bit-pack-encoding.h"
+#include "parquet/encodings/delta-length-byte-array-encoding.h"
+#include "parquet/encodings/delta-byte-array-encoding.h"
 
 #endif // PARQUET_ENCODINGS_ENCODINGS_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h
index a02839d..3e8f95c 100644
--- a/src/parquet/util/bit-stream-utils.h
+++ b/src/parquet/util/bit-stream-utils.h
@@ -69,7 +69,10 @@ class BitWriter {
   /// room.  The value is written byte aligned.
   /// For more details on vlq:
   /// en.wikipedia.org/wiki/Variable-length_quantity
-  bool PutVlqInt(int32_t v);
+  bool PutVlqInt(uint32_t v);
+
+  // Writes an int zigzag encoded.
+  bool PutZigZagVlqInt(int32_t v);
 
   /// Get a pointer to the next aligned byte and advance the underlying buffer
   /// by num_bytes.
@@ -135,6 +138,9 @@ class BitReader {
   /// the buffer.
   bool GetVlqInt(int32_t* v);
 
+  // Reads a zigzag encoded int into `v`.
+  bool GetZigZagVlqInt(int32_t* v);
+
   /// Returns the number of bytes left in the stream, not including the current
   /// byte (i.e., there may be an additional fraction of a byte).
   int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }
@@ -142,10 +148,6 @@ class BitReader {
   /// Maximum byte length of a vlq encoded int
   static const int MAX_VLQ_BYTE_LEN = 5;
 
-  // TODO(nongli): implementations to be fixed given changes in Impala
-  // bool GetZigZagVlqInt(int64_t* v);
-  // bool PutZigZagVlqInt(int32_t v);
-
  private:
   const uint8_t* buffer_;
   int max_bytes_;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h
index 77e2d48..e0dcab8 100644
--- a/src/parquet/util/bit-stream-utils.inline.h
+++ b/src/parquet/util/bit-stream-utils.inline.h
@@ -75,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-inline bool BitWriter::PutVlqInt(int32_t v) {
+inline bool BitWriter::PutVlqInt(uint32_t v) {
   bool result = true;
   while ((v & 0xFFFFFF80) != 0L) {
     result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
@@ -152,20 +152,18 @@ inline bool BitReader::GetVlqInt(int32_t* v) {
   return true;
 }
 
-// TODO(nongli): review/test these implementations given divergence in Impala
-// functions
-
-// inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-//   uint32_t u = (v << 1) ^ (v >> 31);
-//   return PutVlqInt(u);
-// }
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+  uint32_t u = (v << 1) ^ (v >> 31);
+  return PutVlqInt(u);
+}
 
-// inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
-//   uint64_t u;
-//   if (!GetVlqInt(&u)) return false;
-//   *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
-//   return true;
-// }
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+  int32_t u_signed;
+  if (!GetVlqInt(&u_signed)) return false;
+  uint32_t u = static_cast<uint32_t>(u_signed);
+  *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(u & 1);
+  return true;
+}
 
 } // namespace parquet_cpp
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc
index 78efe1a..a8b6be0 100644
--- a/src/parquet/util/bit-util-test.cc
+++ b/src/parquet/util/bit-util-test.cc
@@ -26,6 +26,7 @@
 #include <gtest/gtest.h>
 
 #include "parquet/util/bit-util.h"
+#include "parquet/util/bit-stream-utils.inline.h"
 #include "parquet/util/cpu-info.h"
 
 namespace parquet_cpp {
@@ -161,4 +162,22 @@ TEST(BitUtil, RoundUpDown) {
   EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
 }
 
+void TestZigZag(int32_t v) {
+  uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
+  BitWriter writer(buffer, sizeof(buffer));
+  BitReader reader(buffer, sizeof(buffer));
+  writer.PutZigZagVlqInt(v);
+  int32_t result;
+  EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
+  EXPECT_EQ(v, result);
+}
+
+TEST(BitStreamUtil, ZigZag) {
+  TestZigZag(0);
+  TestZigZag(1);
+  TestZigZag(-1);
+  TestZigZag(std::numeric_limits<int32_t>::max());
+  TestZigZag(-std::numeric_limits<int32_t>::max());
+}
+
 } // namespace parquet_cpp


Mime
View raw message