parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-700: Disable dictionary encoding for boolean columns
Date Fri, 02 Sep 2016 15:34:41 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 261072ca9 -> 5e524d146


PARQUET-700: Disable dictionary encoding for boolean columns

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #148 from xhochy/parquet-700 and squashes the following commits:

d33a670 [Uwe L. Korn] Format fixes
e8530ba [Uwe L. Korn] Also test writing booleans with Dictionary encoding
328b430 [Uwe L. Korn] Format fixes
ab33f9b [Uwe L. Korn] PARQUET-700: Disable dictionary encoding for boolean columns


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/5e524d14
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/5e524d14
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/5e524d14

Branch: refs/heads/master
Commit: 5e524d146c556b1f2ef6da6f8d9a6dbb6b8cea73
Parents: 261072c
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Fri Sep 2 11:34:31 2016 -0400
Committer: Wes McKinney <wesm@apache.org>
Committed: Fri Sep 2 11:34:31 2016 -0400

----------------------------------------------------------------------
 src/parquet/column/column-writer-test.cc | 29 +++++++++++++--------------
 src/parquet/column/writer.cc             |  3 ++-
 2 files changed, 16 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/5e524d14/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index ab232ea..3806bd0 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -84,14 +84,22 @@ class TestPrimitiveWriter : public ::testing::Test {
     reader_.reset(new TypedColumnReader<TestType>(schema_.get(), std::move(page_reader)));
   }
 
-  std::unique_ptr<TypedColumnWriter<TestType>> BuildWriter(
+  std::shared_ptr<TypedColumnWriter<TestType>> BuildWriter(
       int64_t output_size = SMALL_SIZE, Encoding::type encoding = Encoding::PLAIN) {
     sink_.reset(new InMemoryOutputStream());
     std::unique_ptr<SerializedPageWriter> pager(
         new SerializedPageWriter(sink_.get(), Compression::UNCOMPRESSED, &metadata_));
-    return std::unique_ptr<TypedColumnWriter<TestType>>(
-        new TypedColumnWriter<TestType>(schema_.get(), std::move(pager), output_size,
-            encoding, writer_properties_.get()));
+    WriterProperties::Builder wp_builder;
+    if (encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY) {
+      wp_builder.enable_dictionary();
+    } else {
+      wp_builder.disable_dictionary();
+      wp_builder.encoding(encoding);
+    }
+    writer_properties_ = wp_builder.build();
+    std::shared_ptr<ColumnWriter> writer = ColumnWriter::Make(
+        schema_.get(), std::move(pager), output_size, writer_properties_.get());
+    return std::static_pointer_cast<TypedColumnWriter<TestType>>(writer);
   }
 
   void SyncValuesOut();
@@ -106,7 +114,7 @@ class TestPrimitiveWriter : public ::testing::Test {
     this->GenerateData(SMALL_SIZE);
 
     // Test case 1: required and non-repeated, so no definition or repetition levels
-    std::unique_ptr<TypedColumnWriter<TestType>> writer =
+    std::shared_ptr<TypedColumnWriter<TestType>> writer =
         this->BuildWriter(SMALL_SIZE, encoding);
     writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
     // The behaviour should be independent from the number of Close() calls
@@ -191,20 +199,11 @@ typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
 
 TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
 
-// Dictionary encoding for booleans is not supported.
-typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
-    ByteArrayType, FLBAType> TestDictionaryTypes;
-
-template <typename T>
-class TestPrimitiveDictionaryWriter : public TestPrimitiveWriter<T> {};
-
-TYPED_TEST_CASE(TestPrimitiveDictionaryWriter, TestDictionaryTypes);
-
 TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
   this->TestRequiredWithEncoding(Encoding::PLAIN);
 }
 
-TYPED_TEST(TestPrimitiveDictionaryWriter, RequiredDictionary) {
+TYPED_TEST(TestPrimitiveWriter, RequiredDictionary) {
   this->TestRequiredWithEncoding(Encoding::PLAIN_DICTIONARY);
 }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/5e524d14/src/parquet/column/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc
index 1c376ad..da4b17c 100644
--- a/src/parquet/column/writer.cc
+++ b/src/parquet/column/writer.cc
@@ -200,7 +200,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(const ColumnDescriptor* descr,
     std::unique_ptr<PageWriter> pager, int64_t expected_rows,
     const WriterProperties* properties) {
   Encoding::type encoding = properties->encoding(descr->path());
-  if (properties->dictionary_enabled(descr->path())) {
+  if (properties->dictionary_enabled(descr->path()) &&
+      descr->physical_type() != Type::BOOLEAN) {
     encoding = properties->dictionary_page_encoding();
   }
   switch (descr->physical_type()) {


Mime
View raw message