Repository: parquet-cpp
Updated Branches:
refs/heads/master 08088af76 -> ee83fad67
PARQUET-503: Reenable parquet 2.0 encoding implementations.
Author: Nong Li <nongli@gmail.com>
Closes #35 from nongli/parquet-503 and squashes the following commits:
cb2a4e1 [Nong Li] PARQUET-503: Reenable parquet 2.0 encoding implementations.
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ee83fad6
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ee83fad6
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ee83fad6
Branch: refs/heads/master
Commit: ee83fad67d07977b6493dc1e7b0dde63d58b9bf8
Parents: 08088af
Author: Nong Li <nongli@gmail.com>
Authored: Tue Feb 2 14:50:00 2016 -0800
Committer: Julien Le Dem <julien@dremio.com>
Committed: Tue Feb 2 14:50:00 2016 -0800
----------------------------------------------------------------------
.gitignore | 1 +
example/CMakeLists.txt | 5 ++--
src/parquet/encodings/delta-bit-pack-encoding.h | 10 ++++----
src/parquet/encodings/encodings.h | 8 +++---
src/parquet/util/bit-stream-utils.h | 12 +++++----
src/parquet/util/bit-stream-utils.inline.h | 26 +++++++++-----------
src/parquet/util/bit-util-test.cc | 19 ++++++++++++++
7 files changed, 49 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 172a03a..f90103a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
bin
build
generated
+Testing/
CMakeCache.txt
CMakeFiles
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 730b408..bd9e66c 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -20,9 +20,8 @@ SET(LINK_LIBS
snappystatic
thriftstatic)
-# Disabled because decoding code has changed
-# add_executable(decode_benchmark decode_benchmark.cc)
-# target_link_libraries(decode_benchmark ${LINK_LIBS})
+add_executable(decode_benchmark decode_benchmark.cc)
+target_link_libraries(decode_benchmark ${LINK_LIBS})
add_executable(parquet_reader parquet_reader.cc)
target_link_libraries(parquet_reader ${LINK_LIBS})
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/delta-bit-pack-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/delta-bit-pack-encoding.h b/src/parquet/encodings/delta-bit-pack-encoding.h
index a0833b5..858fcec 100644
--- a/src/parquet/encodings/delta-bit-pack-encoding.h
+++ b/src/parquet/encodings/delta-bit-pack-encoding.h
@@ -54,7 +54,7 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
using Decoder<TYPE>::num_values_;
void InitBlock() {
- uint64_t block_size;
+ int32_t block_size;
if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&values_current_block_)) {
@@ -104,17 +104,17 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
}
BitReader decoder_;
- uint64_t values_current_block_;
- uint64_t num_mini_blocks_;
+ int32_t values_current_block_;
+ int32_t num_mini_blocks_;
uint64_t values_per_mini_block_;
uint64_t values_current_mini_block_;
- int64_t min_delta_;
+ int32_t min_delta_;
int mini_block_idx_;
std::vector<uint8_t> delta_bit_widths_;
int delta_bit_width_;
- int64_t last_value_;
+ int32_t last_value_;
};
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/encodings.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h
index 4fb3d9a..0d9202e 100644
--- a/src/parquet/encodings/encodings.h
+++ b/src/parquet/encodings/encodings.h
@@ -105,10 +105,8 @@ class Encoder {
#include "parquet/encodings/plain-encoding.h"
#include "parquet/encodings/dictionary-encoding.h"
-
-// The encoding tools changed and these are missing the ZigZag functions
-// #include "parquet/encodings/delta-bit-pack-encoding.h"
-// #include "parquet/encodings/delta-length-byte-array-encoding.h"
-// #include "parquet/encodings/delta-byte-array-encoding.h"
+#include "parquet/encodings/delta-bit-pack-encoding.h"
+#include "parquet/encodings/delta-length-byte-array-encoding.h"
+#include "parquet/encodings/delta-byte-array-encoding.h"
#endif // PARQUET_ENCODINGS_ENCODINGS_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h
index a02839d..3e8f95c 100644
--- a/src/parquet/util/bit-stream-utils.h
+++ b/src/parquet/util/bit-stream-utils.h
@@ -69,7 +69,10 @@ class BitWriter {
/// room. The value is written byte aligned.
/// For more details on vlq:
/// en.wikipedia.org/wiki/Variable-length_quantity
- bool PutVlqInt(int32_t v);
+ bool PutVlqInt(uint32_t v);
+
+ // Writes an int zigzag encoded.
+ bool PutZigZagVlqInt(int32_t v);
/// Get a pointer to the next aligned byte and advance the underlying buffer
/// by num_bytes.
@@ -135,6 +138,9 @@ class BitReader {
/// the buffer.
bool GetVlqInt(int32_t* v);
+ // Reads a zigzag encoded int into `v`.
+ bool GetZigZagVlqInt(int32_t* v);
+
/// Returns the number of bytes left in the stream, not including the current
/// byte (i.e., there may be an additional fraction of a byte).
int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8));
}
@@ -142,10 +148,6 @@ class BitReader {
/// Maximum byte length of a vlq encoded int
static const int MAX_VLQ_BYTE_LEN = 5;
- // TODO(nongli): implementations to be fixed given changes in Impala
- // bool GetZigZagVlqInt(int64_t* v);
- // bool PutZigZagVlqInt(int32_t v);
-
private:
const uint8_t* buffer_;
int max_bytes_;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h
index 77e2d48..e0dcab8 100644
--- a/src/parquet/util/bit-stream-utils.inline.h
+++ b/src/parquet/util/bit-stream-utils.inline.h
@@ -75,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
return true;
}
-inline bool BitWriter::PutVlqInt(int32_t v) {
+inline bool BitWriter::PutVlqInt(uint32_t v) {
bool result = true;
while ((v & 0xFFFFFF80) != 0L) {
result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
@@ -152,20 +152,18 @@ inline bool BitReader::GetVlqInt(int32_t* v) {
return true;
}
-// TODO(nongli): review/test these implementations given divergence in Impala
-// functions
-
-// inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-// uint32_t u = (v << 1) ^ (v >> 31);
-// return PutVlqInt(u);
-// }
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+ uint32_t u = (v << 1) ^ (v >> 31);
+ return PutVlqInt(u);
+}
-// inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
-// uint64_t u;
-// if (!GetVlqInt(&u)) return false;
-// *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
-// return true;
-// }
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+ int32_t u_signed;
+ if (!GetVlqInt(&u_signed)) return false;
+ uint32_t u = static_cast<uint32_t>(u_signed);
+ *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(u & 1);
+ return true;
+}
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc
index 78efe1a..a8b6be0 100644
--- a/src/parquet/util/bit-util-test.cc
+++ b/src/parquet/util/bit-util-test.cc
@@ -26,6 +26,7 @@
#include <gtest/gtest.h>
#include "parquet/util/bit-util.h"
+#include "parquet/util/bit-stream-utils.inline.h"
#include "parquet/util/cpu-info.h"
namespace parquet_cpp {
@@ -161,4 +162,22 @@ TEST(BitUtil, RoundUpDown) {
EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
}
+void TestZigZag(int32_t v) {
+ uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
+ BitWriter writer(buffer, sizeof(buffer));
+ BitReader reader(buffer, sizeof(buffer));
+ writer.PutZigZagVlqInt(v);
+ int32_t result;
+ EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
+ EXPECT_EQ(v, result);
+}
+
+TEST(BitStreamUtil, ZigZag) {
+ TestZigZag(0);
+ TestZigZag(1);
+ TestZigZag(-1);
+ TestZigZag(std::numeric_limits<int32_t>::max());
+ TestZigZag(-std::numeric_limits<int32_t>::max());
+}
+
} // namespace parquet_cpp
|