parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-1114 Apply changes for ARROW-1601 ARROW-1611, change shared l…
Date Wed, 27 Sep 2017 04:29:49 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master b1099e4f4 -> 4a1c2c47c


PARQUET-1114 Apply changes for ARROW-1601 ARROW-1611, change shared l…

PARQUET-1114 Apply changes for ARROW-1601 and ARROW-1611, change shared library suffix in
FindArrow.cmake for APPLE

Author: Rene Sugar <rene.sugar@gmail.com>

Closes #405 from renesugar/PARQUET-1114 and squashes the following commits:

8fc0a2e [Rene Sugar] PARQUET-1114 Apply changes for ARROW-1601 ARROW-1611, change shared library
suffix in FindArrow.cmake for APPLE


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/4a1c2c47
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/4a1c2c47
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/4a1c2c47

Branch: refs/heads/master
Commit: 4a1c2c47cd5722a389eaa39fd87edf842932428a
Parents: b1099e4
Author: Rene Sugar <rene.sugar@gmail.com>
Authored: Wed Sep 27 00:29:44 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Wed Sep 27 00:29:44 2017 -0400

----------------------------------------------------------------------
 cmake_modules/FindArrow.cmake           |  6 +++-
 cmake_modules/ThirdpartyToolchain.cmake |  2 +-
 src/parquet/arrow/writer.cc             | 41 +++++++++++++++-------------
 src/parquet/column_reader.h             | 26 ++++++------------
 src/parquet/encoding-internal.h         |  7 +++--
 src/parquet/encoding.h                  |  7 +++--
 src/parquet/statistics.cc               | 11 ++++----
 7 files changed, 50 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/cmake_modules/FindArrow.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindArrow.cmake b/cmake_modules/FindArrow.cmake
index eb1dfca..1ca2736 100644
--- a/cmake_modules/FindArrow.cmake
+++ b/cmake_modules/FindArrow.cmake
@@ -43,7 +43,11 @@ if ("${ARROW_HOME}" STREQUAL "")
 
     set(ARROW_LIB_NAME ${CMAKE_SHARED_LIBRARY_PREFIX}arrow)
 
-    set(ARROW_SHARED_LIB ${ARROW_LIBDIR}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}${ARROW_SHARED_LIB_SUFFIX})
+    if (APPLE)
+      set(ARROW_SHARED_LIB ${ARROW_LIBDIR}/${ARROW_LIB_NAME}${ARROW_SHARED_LIB_SUFFIX}${CMAKE_SHARED_LIBRARY_SUFFIX})
+    else()
+      set(ARROW_SHARED_LIB ${ARROW_LIBDIR}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}${ARROW_SHARED_LIB_SUFFIX})
+    endif()
     set(ARROW_STATIC_LIB ${ARROW_LIBDIR}/${ARROW_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 else()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 1221765..3662540 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -353,7 +353,7 @@ if (NOT ARROW_FOUND)
     -DARROW_BUILD_TESTS=OFF)
 
   if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
-    set(ARROW_VERSION "97f9029ce835dfc2655ca91b9820a2e6aed89107")
+    set(ARROW_VERSION "808a1433005ce5325ba69b1a65d05e1b547eea2c")
   else()
     set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
   endif()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 7f1c45c..e834042 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -149,15 +149,15 @@ class LevelBuilder {
         } else if (array.null_count() == array.length()) {
           std::fill(def_levels_ptr, def_levels_ptr + array.length(), 0);
         } else {
-          const uint8_t* valid_bits = array.null_bitmap_data();
-          INIT_BITSET(valid_bits, static_cast<int>(array.offset()));
+          ::arrow::internal::BitmapReader valid_bits_reader(
+              array.null_bitmap_data(), array.offset(), array.length());
           for (int i = 0; i < array.length(); i++) {
-            if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+            if (valid_bits_reader.IsSet()) {
               def_levels_ptr[i] = 1;
             } else {
               def_levels_ptr[i] = 0;
             }
-            READ_NEXT_BITSET(valid_bits);
+            valid_bits_reader.Next();
           }
         }
         *def_levels = def_levels_buffer_;
@@ -437,12 +437,13 @@ Status FileWriter::Impl::WriteNullableBatch(TypedColumnWriter<ParquetType>* writ
 
   RETURN_NOT_OK(data_buffer_.Resize(num_values * sizeof(ParquetCType)));
   auto buffer_ptr = reinterpret_cast<ParquetCType*>(data_buffer_.mutable_data());
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    num_values);
   for (int i = 0; i < num_values; i++) {
-    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+    if (valid_bits_reader.IsSet()) {
       buffer_ptr[i] = static_cast<ParquetCType>(data_ptr[i]);
     }
-    READ_NEXT_BITSET(valid_bits);
+    valid_bits_reader.Next();
   }
   PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(
       num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, buffer_ptr));
@@ -458,13 +459,14 @@ Status FileWriter::Impl::WriteNullableBatch<Int32Type, ::arrow::Date64Type>(
     const int64_t* data_ptr) {
   RETURN_NOT_OK(data_buffer_.Resize(num_values * sizeof(int32_t)));
   auto buffer_ptr = reinterpret_cast<int32_t*>(data_buffer_.mutable_data());
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    num_values);
   for (int i = 0; i < num_values; i++) {
-    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+    if (valid_bits_reader.IsSet()) {
       // Convert from milliseconds into days since the epoch
       buffer_ptr[i] = static_cast<int32_t>(data_ptr[i] / 86400000);
     }
-    READ_NEXT_BITSET(valid_bits);
+    valid_bits_reader.Next();
   }
   PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(
       num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, buffer_ptr));
@@ -480,21 +482,22 @@ Status FileWriter::Impl::WriteNullableBatch<Int32Type, ::arrow::Time32Type>(
     const int32_t* data_ptr) {
   RETURN_NOT_OK(data_buffer_.Resize(num_values * sizeof(int32_t)));
   auto buffer_ptr = reinterpret_cast<int32_t*>(data_buffer_.mutable_data());
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    num_values);
 
   if (type.unit() == TimeUnit::SECOND) {
     for (int i = 0; i < num_values; i++) {
-      if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      if (valid_bits_reader.IsSet()) {
         buffer_ptr[i] = data_ptr[i] * 1000;
       }
-      READ_NEXT_BITSET(valid_bits);
+      valid_bits_reader.Next();
     }
   } else {
     for (int i = 0; i < num_values; i++) {
-      if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      if (valid_bits_reader.IsSet()) {
         buffer_ptr[i] = data_ptr[i];
       }
-      READ_NEXT_BITSET(valid_bits);
+      valid_bits_reader.Next();
     }
   }
   PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(
@@ -536,14 +539,14 @@ Status FileWriter::Impl::WriteNullableBatch<Int96Type, ::arrow::TimestampType>(
     const int64_t* data_ptr) {
   RETURN_NOT_OK(data_buffer_.Resize(num_values * sizeof(Int96)));
   auto buffer_ptr = reinterpret_cast<Int96*>(data_buffer_.mutable_data());
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
-
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    num_values);
   if (type.unit() == TimeUnit::NANO) {
     for (int i = 0; i < num_values; i++) {
-      if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      if (valid_bits_reader.IsSet()) {
         internal::NanosecondsToImpalaTimestamp(data_ptr[i], buffer_ptr + i);
       }
-      READ_NEXT_BITSET(valid_bits);
+      valid_bits_reader.Next();
     }
   } else {
     return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing");

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/src/parquet/column_reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h
index 6172365..dcf41e8 100644
--- a/src/parquet/column_reader.h
+++ b/src/parquet/column_reader.h
@@ -148,20 +148,19 @@ static inline void DefinitionLevelsToBitmap(
     const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level,
     const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count,
     uint8_t* valid_bits, const int64_t valid_bits_offset) {
-  int64_t byte_offset = valid_bits_offset / 8;
-  int64_t bit_offset = valid_bits_offset % 8;
-  uint8_t bitset = valid_bits[byte_offset];
+  ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset,
+                                                    num_def_levels);
 
   // TODO(itaiin): As an interim solution we are splitting the code path here
   // between repeated+flat column reads, and non-repeated+nested reads.
   // Those paths need to be merged in the future
   for (int i = 0; i < num_def_levels; ++i) {
     if (def_levels[i] == max_definition_level) {
-      bitset |= (1 << bit_offset);
+      valid_bits_writer.Set();
     } else if (max_repetition_level > 0) {
       // repetition+flat case
       if (def_levels[i] == (max_definition_level - 1)) {
-        bitset &= ~(1 << bit_offset);
+        valid_bits_writer.Clear();
         *null_count += 1;
       } else {
         continue;
@@ -169,26 +168,17 @@ static inline void DefinitionLevelsToBitmap(
     } else {
       // non-repeated+nested case
       if (def_levels[i] < max_definition_level) {
-        bitset &= ~(1 << bit_offset);
+        valid_bits_writer.Clear();
         *null_count += 1;
       } else {
         throw ParquetException("definition level exceeds maximum");
       }
     }
 
-    bit_offset++;
-    if (bit_offset == CHAR_BIT) {
-      bit_offset = 0;
-      valid_bits[byte_offset] = bitset;
-      byte_offset++;
-      // TODO: Except for the last byte, this shouldn't be needed
-      bitset = valid_bits[byte_offset];
-    }
-  }
-  if (bit_offset != 0) {
-    valid_bits[byte_offset] = bitset;
+    valid_bits_writer.Next();
   }
-  *values_read = bit_offset + byte_offset * 8 - valid_bits_offset;
+  valid_bits_writer.Finish();
+  *values_read = valid_bits_writer.position();
 }
 
 }  // namespace internal

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/src/parquet/encoding-internal.h
----------------------------------------------------------------------
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 5818fd3..be38752 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -530,12 +530,13 @@ class DictEncoder : public Encoder<DType> {
 
   void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
                  int64_t valid_bits_offset) override {
-    INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+    ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                      num_values);
     for (int32_t i = 0; i < num_values; i++) {
-      if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      if (valid_bits_reader.IsSet()) {
         Put(src[i]);
       }
-      READ_NEXT_BITSET(valid_bits);
+      valid_bits_reader.Next();
     }
   }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/src/parquet/encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h
index e7ed415..e46ac2f 100644
--- a/src/parquet/encoding.h
+++ b/src/parquet/encoding.h
@@ -59,13 +59,14 @@ class Encoder {
       throw ParquetException(ss.str());
     }
     int32_t num_valid_values = 0;
-    INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
+    ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                      num_values);
     T* data = reinterpret_cast<T*>(buffer.mutable_data());
     for (int32_t i = 0; i < num_values; i++) {
-      if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      if (valid_bits_reader.IsSet()) {
         data[num_valid_values++] = src[i];
       }
-      READ_NEXT_BITSET(valid_bits);
+      valid_bits_reader.Next();
     }
     Put(data, num_valid_values);
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4a1c2c47/src/parquet/statistics.cc
----------------------------------------------------------------------
diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc
index dad1a9b..4c69632 100644
--- a/src/parquet/statistics.cc
+++ b/src/parquet/statistics.cc
@@ -135,28 +135,29 @@ void TypedRowGroupStatistics<DType>::UpdateSpaced(const T* values,
   // TODO: support distinct count?
   if (num_not_null == 0) return;
 
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
   // Find first valid entry and use that for min/max
   // As (num_not_null != 0) there must be one
   int64_t length = num_null + num_not_null;
   int64_t i = 0;
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    length);
   for (; i < length; i++) {
-    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+    if (valid_bits_reader.IsSet()) {
       break;
     }
-    READ_NEXT_BITSET(valid_bits);
+    valid_bits_reader.Next();
   }
   T min = values[i];
   T max = values[i];
   for (; i < length; i++) {
-    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+    if (valid_bits_reader.IsSet()) {
       if ((std::ref(*(this->comparator_)))(values[i], min)) {
         min = values[i];
       } else if ((std::ref(*(this->comparator_)))(max, values[i])) {
         max = values[i];
       }
     }
-    READ_NEXT_BITSET(valid_bits);
+    valid_bits_reader.Next();
   }
   if (!has_min_max_) {
     has_min_max_ = true;


Mime
View raw message