parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-866: API fixes for ARROW-33 patch
Date Mon, 06 Feb 2017 22:33:56 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 36bda2520 -> 98c5ddaf6


PARQUET-866: API fixes for ARROW-33 patch

See ARROW-33 patch https://github.com/apache/arrow/pull/322

@xhochy this fails on Int96 timestamps. I'm not sure why yet

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #236 from wesm/PARQUET-866 and squashes the following commits:

4966fcb [Wes McKinney] Fix off-by-one error in int96 test case
5976d59 [Wes McKinney] Update Arrow version to head with ARROW-33
b1b69b9 [Wes McKinney] clang-format
dfb2e2e [Wes McKinney] API fixes for ARROW-33 patch


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/98c5ddaf
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/98c5ddaf
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/98c5ddaf

Branch: refs/heads/master
Commit: 98c5ddaf6ac9862da6b5f71fc97c95554b04c357
Parents: 36bda25
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Feb 6 17:33:48 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Feb 6 17:33:48 2017 -0500

----------------------------------------------------------------------
 cmake_modules/ThirdpartyToolchain.cmake       |  2 +-
 src/parquet/arrow/arrow-reader-writer-test.cc |  9 ++++++---
 src/parquet/arrow/reader.cc                   |  8 ++++----
 src/parquet/arrow/test-util.h                 | 21 ++++++++++-----------
 src/parquet/arrow/writer.cc                   |  6 +++---
 src/parquet/util/cpu-info.cc                  |  6 +++---
 6 files changed, 27 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index c7f13b3..aa83d4b 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -22,7 +22,7 @@ set(THRIFT_VERSION "0.9.1")
 
 # Brotli 0.5.2 does not install headers/libraries yet, but 0.6.0.dev does
 set(BROTLI_VERSION "5db62dcc9d386579609540cdf8869e95ad334bbd")
-set(ARROW_VERSION "4226adfbc6b3dff10b3fe7a6691b30bcc94140bd")
+set(ARROW_VERSION "5439b71586f4b0f9a36544b9e2417ee6ad7b48e8")
 
 # find boost headers and libs
 set(Boost_DEBUG TRUE)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 9a7fea9..63953ca 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -491,7 +491,7 @@ TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
   // 2nd January 1970, 11:35min 145738543ns
   Int96 day;
   day.value[2] = 2440589l;
-  int64_t seconds = ((1 * 24 + 11) * 60 + 35) * 60;
+  int64_t seconds = (11 * 60 + 35) * 60;
   *(reinterpret_cast<int64_t*>(&(day.value))) =
       seconds * 1000l * 1000l * 1000l + 145738543;
   // Compute the corresponding nanosecond timestamp
@@ -587,8 +587,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
       int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]);
     }
   }
+
+  const int32_t kOffset = 0;
   ASSERT_OK(MakePrimitiveArray(std::make_shared<::arrow::Int64Type>(), values->length(),
-      int64_data, values->null_count(), values->null_bitmap(), &expected_values));
+      int64_data, values->null_bitmap(), values->null_count(), kOffset,
+      &expected_values));
   this->ReadAndCheckSingleColumnTable(expected_values);
 }
 
@@ -596,7 +599,7 @@ using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
 TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
   std::shared_ptr<Array> values;
-  ::arrow::StringBuilder builder(::arrow::default_memory_pool(), ::arrow::utf8());
+  ::arrow::StringBuilder builder(::arrow::default_memory_pool());
   for (size_t i = 0; i < SMALL_SIZE; i++) {
     builder.Append("");
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 5059494..df34d4c 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -616,7 +616,7 @@ Status ColumnReader::Impl::WrapIntoListArray(const int16_t* def_levels,
       auto list_type = std::make_shared<::arrow::ListType>(
           std::make_shared<Field>("item", output->type(), nullable[j + 1]));
       output = std::make_shared<::arrow::ListArray>(
-          list_type, list_lengths[j], offsets[j], output, null_counts[j], valid_bits[j]);
+          list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]);
     }
     *array = output;
   }
@@ -667,7 +667,7 @@ Status ColumnReader::Impl::TypedReadBatch(int batch_size, std::shared_ptr<Array>
           ::arrow::BitUtil::CeilByte(valid_bits_idx_) / 8, false));
     }
     *out = std::make_shared<ArrayType<ArrowType>>(
-        field_->type, valid_bits_idx_, data_buffer_, null_count_, valid_bits_buffer_);
+        field_->type, valid_bits_idx_, data_buffer_, valid_bits_buffer_, null_count_);
     // Relase the ownership as the Buffer is now part of a new Array
     valid_bits_buffer_.reset();
   } else {
@@ -741,7 +741,7 @@ Status ColumnReader::Impl::TypedReadBatch<::arrow::BooleanType, BooleanType>(
       valid_bits_buffer_ = valid_bits_buffer;
     }
     *out = std::make_shared<BooleanArray>(
-        field_->type, valid_bits_idx_, data_buffer_, null_count_, valid_bits_buffer_);
+        field_->type, valid_bits_idx_, data_buffer_, valid_bits_buffer_, null_count_);
     // Relase the ownership
     data_buffer_.reset();
     valid_bits_buffer_.reset();
@@ -770,7 +770,7 @@ Status ColumnReader::Impl::ReadByteArrayBatch(
   int16_t* rep_levels = reinterpret_cast<int16_t*>(rep_levels_buffer_.mutable_data());
 
   int values_to_read = batch_size;
-  BuilderType builder(pool_, field_->type);
+  BuilderType builder(pool_);
   while ((values_to_read > 0) && column_reader_) {
     RETURN_NOT_OK(values_buffer_.Resize(values_to_read * sizeof(ByteArray), false));
     auto reader = dynamic_cast<TypedColumnReader<ByteArrayType>*>(column_reader_.get());

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/src/parquet/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 4d87dd8..bfc9ce1 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -47,8 +47,7 @@ typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NonNullA
     size_t size, std::shared_ptr<Array>* out) {
   std::vector<typename ArrowType::c_type> values;
   ::arrow::test::random_real<typename ArrowType::c_type>(size, 0, 0, 1, &values);
-  ::arrow::NumericBuilder<ArrowType> builder(
-      ::arrow::default_memory_pool(), std::make_shared<ArrowType>());
+  ::arrow::NumericBuilder<ArrowType> builder(::arrow::default_memory_pool());
   builder.Append(values.data(), values.size());
   return builder.Finish(out);
 }
@@ -58,6 +57,8 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NonNullArr
     size_t size, std::shared_ptr<Array>* out) {
   std::vector<typename ArrowType::c_type> values;
   ::arrow::test::randint<typename ArrowType::c_type>(size, 0, 64, &values);
+
+  // Passing data type so this will work with TimestampType too
   ::arrow::NumericBuilder<ArrowType> builder(
       ::arrow::default_memory_pool(), std::make_shared<ArrowType>());
   builder.Append(values.data(), values.size());
@@ -69,7 +70,7 @@ typename std::enable_if<
     is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
 NonNullArray(size_t size, std::shared_ptr<Array>* out) {
   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
-  BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
+  BuilderType builder(::arrow::default_memory_pool());
   for (size_t i = 0; i < size; i++) {
     builder.Append("test-string");
   }
@@ -81,8 +82,7 @@ typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NonNullAr
     size_t size, std::shared_ptr<Array>* out) {
   std::vector<uint8_t> values;
   ::arrow::test::randint<uint8_t>(size, 0, 1, &values);
-  ::arrow::BooleanBuilder builder(
-      ::arrow::default_memory_pool(), std::make_shared<::arrow::BooleanType>());
+  ::arrow::BooleanBuilder builder(::arrow::default_memory_pool());
   builder.Append(values.data(), values.size());
   return builder.Finish(out);
 }
@@ -100,8 +100,7 @@ typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type Nullable
     valid_bytes[i * 2] = 0;
   }
 
-  ::arrow::NumericBuilder<ArrowType> builder(
-      ::arrow::default_memory_pool(), std::make_shared<ArrowType>());
+  ::arrow::NumericBuilder<ArrowType> builder(::arrow::default_memory_pool());
   builder.Append(values.data(), values.size(), valid_bytes.data());
   return builder.Finish(out);
 }
@@ -121,6 +120,7 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NullableAr
     valid_bytes[i * 2] = 0;
   }
 
+  // Passing data type so this will work with TimestampType too
   ::arrow::NumericBuilder<ArrowType> builder(
       ::arrow::default_memory_pool(), std::make_shared<ArrowType>());
   builder.Append(values.data(), values.size(), valid_bytes.data());
@@ -140,7 +140,7 @@ NullableArray(
   }
 
   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
-  BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
+  BuilderType builder(::arrow::default_memory_pool());
 
   const int kBufferSize = 10;
   uint8_t buffer[kBufferSize];
@@ -171,8 +171,7 @@ typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NullableA
     valid_bytes[i * 2] = 0;
   }
 
-  ::arrow::BooleanBuilder builder(
-      ::arrow::default_memory_pool(), std::make_shared<::arrow::BooleanType>());
+  ::arrow::BooleanBuilder builder(::arrow::default_memory_pool());
   builder.Append(values.data(), values.size(), valid_bytes.data());
   return builder.Finish(out);
 }
@@ -211,7 +210,7 @@ Status MakeListArary(const std::shared_ptr<Array>& values, int64_t size,
   auto value_field =
       std::make_shared<::arrow::Field>("item", values->type(), nullable_values);
   *out = std::make_shared<::arrow::ListArray>(
-      ::arrow::list(value_field), size, offsets, values, null_count, null_bitmap);
+      ::arrow::list(value_field), size, offsets, values, null_bitmap, null_count);
 
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 7556313..0be6b69 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -88,10 +88,10 @@ class LevelBuilder : public ::arrow::ArrayVisitor {
   Status Visit(const ListArray& array) override {
     valid_bitmaps_.push_back(array.null_bitmap_data());
     null_counts_.push_back(array.null_count());
-    offsets_.push_back(array.raw_offsets());
+    offsets_.push_back(array.raw_value_offsets());
 
-    min_offset_idx_ = array.raw_offsets()[min_offset_idx_];
-    max_offset_idx_ = array.raw_offsets()[max_offset_idx_];
+    min_offset_idx_ = array.raw_value_offsets()[min_offset_idx_];
+    max_offset_idx_ = array.raw_value_offsets()[max_offset_idx_];
 
     return array.values()->Accept(this);
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/98c5ddaf/src/parquet/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/cpu-info.cc b/src/parquet/util/cpu-info.cc
index dd31a31..ba0d146 100644
--- a/src/parquet/util/cpu-info.cc
+++ b/src/parquet/util/cpu-info.cc
@@ -132,9 +132,9 @@ void CpuInfo::Init() {
 #else
 #ifndef _SC_LEVEL1_DCACHE_SIZE
   // Provide reasonable default values if no info
-  cache_sizes_[0] = 32 * 1024;   // Level 1: 32k
-  cache_sizes_[1] = 256 * 1024;  // Level 2: 256k
-  cache_sizes_[2] = 3072 * 1024; // Level 3: 3M
+  cache_sizes_[0] = 32 * 1024;    // Level 1: 32k
+  cache_sizes_[1] = 256 * 1024;   // Level 2: 256k
+  cache_sizes_[2] = 3072 * 1024;  // Level 3: 3M
 #else
   // Call sysconf to query for the cache sizes
   cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);


Mime
View raw message