arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1648: C++: Add cast from Dictionary[NullType] to NullType
Date Fri, 13 Oct 2017 19:34:36 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 47e6ff6cf -> dc533211a


ARROW-1648: C++: Add cast from Dictionary[NullType] to NullType

Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1189 from xhochy/ARROW-1648 and squashes the following commits:

21c2e33e [Wes McKinney] Resolve rebase conflicts
8d531c82 [Korn, Uwe] Mark single argument constructor explicit
68f180f4 [Korn, Uwe] ARROW-1648: C++: Add cast from Dictionary[NullType] to NullType


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/dc533211
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/dc533211
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/dc533211

Branch: refs/heads/master
Commit: dc533211a88c9b43d250841d5cba4261af01035e
Parents: 47e6ff6
Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
Authored: Fri Oct 13 15:34:19 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Fri Oct 13 15:34:19 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc              |  46 ++++++
 cpp/src/arrow/builder.h               |  22 +++
 cpp/src/arrow/compute/cast.cc         |  35 +++--
 cpp/src/arrow/compute/compute-test.cc |   2 +-
 cpp/src/arrow/python/util/datetime.h  | 219 ++++++++++++++---------------
 cpp/src/arrow/test-common.h           |   6 +
 6 files changed, 202 insertions(+), 128 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 7152c7a..076c156 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -830,6 +830,16 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
   }
 }
 
+DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
+                                               MemoryPool* pool)
+    : ArrayBuilder(type, pool), values_builder_(pool) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+DictionaryBuilder<NullType>::~DictionaryBuilder() {}
+
 template <>
 DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
     const std::shared_ptr<DataType>& type, MemoryPool* pool)
@@ -858,6 +868,11 @@ Status DictionaryBuilder<T>::Init(int64_t elements) {
   return values_builder_.Init(elements);
 }
 
+Status DictionaryBuilder<NullType>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  return values_builder_.Init(elements);
+}
+
 template <typename T>
 Status DictionaryBuilder<T>::Resize(int64_t capacity) {
   if (capacity < kMinBuilderCapacity) {
@@ -871,6 +886,18 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) {
   }
 }
 
+Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
 template <typename T>
 Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
   std::shared_ptr<Array> dictionary;
@@ -881,6 +908,14 @@ Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
   return Status::OK();
 }
 
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
 template <typename T>
 Status DictionaryBuilder<T>::Append(const Scalar& value) {
   RETURN_NOT_OK(Reserve(1));
@@ -928,6 +963,13 @@ Status DictionaryBuilder<T>::AppendArray(const Array& array) {
   return Status::OK();
 }
 
+Status DictionaryBuilder<NullType>::AppendArray(const Array& array) {
+  for (int64_t i = 0; i < array.length(); i++) {
+    RETURN_NOT_OK(AppendNull());
+  }
+  return Status::OK();
+}
+
 template <>
 Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
   if (!type_->Equals(*array.type())) {
@@ -950,6 +992,8 @@ Status DictionaryBuilder<T>::AppendNull() {
   return values_builder_.AppendNull();
 }
 
+Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.AppendNull(); }
+
 template <typename T>
 Status DictionaryBuilder<T>::DoubleTableSize() {
   int new_size = hash_table_size_ * 2;
@@ -1438,6 +1482,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
 Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                              std::shared_ptr<ArrayBuilder>* out) {
   switch (type->id()) {
+    DICTIONARY_BUILDER_CASE(NA, DictionaryBuilder<NullType>);
     DICTIONARY_BUILDER_CASE(UINT8, DictionaryBuilder<UInt8Type>);
     DICTIONARY_BUILDER_CASE(INT8, DictionaryBuilder<Int8Type>);
     DICTIONARY_BUILDER_CASE(UINT16, DictionaryBuilder<UInt16Type>);
@@ -1474,6 +1519,7 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
   const std::shared_ptr<DataType>& type = input.data()->type;
   std::shared_ptr<ArrayBuilder> builder;
   switch (type->id()) {
+    DICTIONARY_ARRAY_CASE(NA, DictionaryBuilder<NullType>);
     DICTIONARY_ARRAY_CASE(UINT8, DictionaryBuilder<UInt8Type>);
     DICTIONARY_ARRAY_CASE(INT8, DictionaryBuilder<Int8Type>);
     DICTIONARY_ARRAY_CASE(UINT16, DictionaryBuilder<UInt16Type>);

http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 54d11cf..1720c00 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -905,6 +905,28 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
   int32_t byte_width_;
 };
 
+template <>
+class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
+ public:
+  ~DictionaryBuilder();
+
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+  explicit DictionaryBuilder(MemoryPool* pool);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  AdaptiveIntBuilder values_builder_;
+};
+
 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
  public:
   using DictionaryBuilder::Append;

http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/compute/cast.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index 149cc36..2381e1e 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -123,6 +123,12 @@ struct CastFunctor<T, NullType, typename std::enable_if<
   }
 };
 
+template <>
+struct CastFunctor<NullType, DictionaryType> {
+  void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+                  ArrayData* output) {}
+};
+
 // ----------------------------------------------------------------------
 // Boolean to other things
 
@@ -499,23 +505,25 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
       return Status::NotImplemented(ss.str());
     }
 
-    const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
+    if (type_id != Type::NA) {
+      const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
 
-    int bit_width = fw_type.bit_width();
-    int64_t buffer_size = 0;
+      int bit_width = fw_type.bit_width();
+      int64_t buffer_size = 0;
 
-    if (bit_width == 1) {
-      buffer_size = BitUtil::BytesForBits(length);
-    } else if (bit_width % 8 == 0) {
-      buffer_size = length * fw_type.bit_width() / 8;
-    } else {
-      DCHECK(false);
-    }
+      if (bit_width == 1) {
+        buffer_size = BitUtil::BytesForBits(length);
+      } else if (bit_width % 8 == 0) {
+        buffer_size = length * fw_type.bit_width() / 8;
+      } else {
+        DCHECK(false);
+      }
 
-    RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
-    memset(out_data->mutable_data(), 0, buffer_size);
+      RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
+      memset(out_data->mutable_data(), 0, buffer_size);
 
-    out->buffers.push_back(out_data);
+      out->buffers.push_back(out_data);
+    }
   }
 
   return Status::OK();
@@ -601,6 +609,7 @@ class CastKernel : public UnaryKernel {
 #define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType);
 
 #define DICTIONARY_CASES(FN, IN_TYPE) \
+  FN(IN_TYPE, NullType);              \
   FN(IN_TYPE, Time32Type);            \
   FN(IN_TYPE, Date32Type);            \
   FN(IN_TYPE, TimestampType);         \

http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/compute/compute-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index a4b502d..602acff 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -379,7 +379,7 @@ TEST_F(TestCast, PreallocatedMemory) {
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
 
-typedef ::testing::Types<UInt8Type, Int8Type, UInt16Type, Int16Type, Int32Type,
+typedef ::testing::Types<NullType, UInt8Type, Int8Type, UInt16Type, Int16Type, Int32Type,
                          UInt32Type, UInt64Type, Int64Type, FloatType, DoubleType,
                          Date32Type, Date64Type, FixedSizeBinaryType, BinaryType>
     TestTypes;

http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/python/util/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index 01fbc18..782960f 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -22,9 +22,9 @@
 #include <sstream>
 
 #include <datetime.h>
+#include "arrow/python/platform.h"
 #include "arrow/status.h"
 #include "arrow/util/logging.h"
-#include "arrow/python/platform.h"
 
 namespace arrow {
 namespace py {
@@ -34,131 +34,126 @@ namespace py {
 
 // Days per month, regular year and leap year
 static int64_t _days_per_month_table[2][12] = {
-    { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
-    { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }
-};
+    {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+    {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
 
 static bool is_leapyear(int64_t year) {
-    return (year & 0x3) == 0 && // year % 4 == 0
-           ((year % 100) != 0 ||
-            (year % 400) == 0);
+  return (year & 0x3) == 0 &&  // year % 4 == 0
+         ((year % 100) != 0 || (year % 400) == 0);
 }
 
 // Calculates the days offset from the 1970 epoch.
-static int64_t get_days_from_date(int64_t date_year,
-                                  int64_t date_month,
+static int64_t get_days_from_date(int64_t date_year, int64_t date_month,
                                   int64_t date_day) {
-    int64_t i, month;
-    int64_t year, days = 0;
-    int64_t *month_lengths;
-
-    year = date_year - 1970;
-    days = year * 365;
-
-    // Adjust for leap years
-    if (days >= 0) {
-        // 1968 is the closest leap year before 1970.
-        // Exclude the current year, so add 1.
-        year += 1;
-        // Add one day for each 4 years
-        days += year / 4;
-        // 1900 is the closest previous year divisible by 100
-        year += 68;
-        // Subtract one day for each 100 years
-        days -= year / 100;
-        // 1600 is the closest previous year divisible by 400
-        year += 300;
-        // Add one day for each 400 years
-        days += year / 400;
-    } else {
-        // 1972 is the closest later year after 1970.
-        // Include the current year, so subtract 2.
-        year -= 2;
-        // Subtract one day for each 4 years
-        days += year / 4;
-        // 2000 is the closest later year divisible by 100
-        year -= 28;
-        // Add one day for each 100 years
-        days -= year / 100;
-        // 2000 is also the closest later year divisible by 400
-        // Subtract one day for each 400 years
-        days += year / 400;
-    }
+  int64_t i, month;
+  int64_t year, days = 0;
+  int64_t* month_lengths;
+
+  year = date_year - 1970;
+  days = year * 365;
+
+  // Adjust for leap years
+  if (days >= 0) {
+    // 1968 is the closest leap year before 1970.
+    // Exclude the current year, so add 1.
+    year += 1;
+    // Add one day for each 4 years
+    days += year / 4;
+    // 1900 is the closest previous year divisible by 100
+    year += 68;
+    // Subtract one day for each 100 years
+    days -= year / 100;
+    // 1600 is the closest previous year divisible by 400
+    year += 300;
+    // Add one day for each 400 years
+    days += year / 400;
+  } else {
+    // 1972 is the closest later year after 1970.
+    // Include the current year, so subtract 2.
+    year -= 2;
+    // Subtract one day for each 4 years
+    days += year / 4;
+    // 2000 is the closest later year divisible by 100
+    year -= 28;
+    // Add one day for each 100 years
+    days -= year / 100;
+    // 2000 is also the closest later year divisible by 400
+    // Subtract one day for each 400 years
+    days += year / 400;
+  }
 
-    month_lengths = _days_per_month_table[is_leapyear(date_year)];
-    month = date_month - 1;
+  month_lengths = _days_per_month_table[is_leapyear(date_year)];
+  month = date_month - 1;
 
-    // Add the months
-    for (i = 0; i < month; ++i) {
-        days += month_lengths[i];
-    }
+  // Add the months
+  for (i = 0; i < month; ++i) {
+    days += month_lengths[i];
+  }
 
-    // Add the days
-    days += date_day - 1;
+  // Add the days
+  days += date_day - 1;
 
-    return days;
+  return days;
 }
 
 // Modifies '*days_' to be the day offset within the year,
 // and returns the year.
 static int64_t days_to_yearsdays(int64_t* days_) {
-    const int64_t days_per_400years = (400*365 + 100 - 4 + 1);
-    // Adjust so it's relative to the year 2000 (divisible by 400)
-    int64_t days = (*days_) - (365*30 + 7);
-    int64_t year;
-
-    // Break down the 400 year cycle to get the year and day within the year
-    if (days >= 0) {
-        year = 400 * (days / days_per_400years);
-        days = days % days_per_400years;
-    } else {
-        year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
-        days = days % days_per_400years;
-        if (days < 0) {
-            days += days_per_400years;
-        }
+  const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1);
+  // Adjust so it's relative to the year 2000 (divisible by 400)
+  int64_t days = (*days_) - (365 * 30 + 7);
+  int64_t year;
+
+  // Break down the 400 year cycle to get the year and day within the year
+  if (days >= 0) {
+    year = 400 * (days / days_per_400years);
+    days = days % days_per_400years;
+  } else {
+    year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
+    days = days % days_per_400years;
+    if (days < 0) {
+      days += days_per_400years;
     }
+  }
 
-    // Work out the year/day within the 400 year cycle
-    if (days >= 366) {
-        year += 100 * ((days-1) / (100*365 + 25 - 1));
-        days = (days-1) % (100*365 + 25 - 1);
-        if (days >= 365) {
-            year += 4 * ((days+1) / (4*365 + 1));
-            days = (days+1) % (4*365 + 1);
-            if (days >= 366) {
-                year += (days-1) / 365;
-                days = (days-1) % 365;
-            }
-        }
+  // Work out the year/day within the 400 year cycle
+  if (days >= 366) {
+    year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
+    days = (days - 1) % (100 * 365 + 25 - 1);
+    if (days >= 365) {
+      year += 4 * ((days + 1) / (4 * 365 + 1));
+      days = (days + 1) % (4 * 365 + 1);
+      if (days >= 366) {
+        year += (days - 1) / 365;
+        days = (days - 1) % 365;
+      }
     }
+  }
 
-    *days_ = days;
-    return year + 2000;
+  *days_ = days;
+  return year + 2000;
 }
 
 // Extracts the month and year and day number from a number of days
-static void get_date_from_days(int64_t days,
-                               int64_t* date_year,
-                               int64_t* date_month,
+static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month,
                                int64_t* date_day) {
-    int64_t *month_lengths, i;
-
-    *date_year = days_to_yearsdays(&days);
-    month_lengths = _days_per_month_table[is_leapyear(*date_year)];
-
-    for (i = 0; i < 12; ++i) {
-        if (days < month_lengths[i]) {
-            *date_month = i + 1;
-            *date_day = days + 1;
-            return;
-        } else {
-            days -= month_lengths[i];
-        }
+  int64_t *month_lengths, i;
+
+  *date_year = days_to_yearsdays(&days);
+  month_lengths = _days_per_month_table[is_leapyear(*date_year)];
+
+  for (i = 0; i < 12; ++i) {
+    if (days < month_lengths[i]) {
+      *date_month = i + 1;
+      *date_day = days + 1;
+      return;
+    } else {
+      days -= month_lengths[i];
     }
+  }
 
-    // Should never get here
-    return;
+  // Should never get here
+  return;
 }
 
 static inline int64_t PyTime_to_us(PyObject* pytime) {
@@ -168,7 +163,6 @@ static inline int64_t PyTime_to_us(PyObject* pytime) {
           PyDateTime_TIME_GET_MICROSECOND(pytime));
 }
 
-
 // Splitting time quantities, for example splitting total seconds into
 // minutes and remaining seconds. After we run
 // int64_t remaining = split_time(total, quotient, &next)
@@ -188,8 +182,8 @@ static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next)
 }
 
 static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
-                                        int64_t *hour, int64_t *minute,
-                                        int64_t *second, int64_t *microsecond) {
+                                        int64_t* hour, int64_t* minute, int64_t* second,
+                                        int64_t* microsecond) {
   switch (unit) {
     case TimeUnit::NANO:
       if (val % 1000 != 0) {
@@ -234,13 +228,10 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit,
   hour = split_time(hour, 24, &total_days);
   int64_t year = 0, month = 0, day = 0;
   get_date_from_days(total_days, &year, &month, &day);
-  *out = PyDateTime_FromDateAndTime(static_cast<int32_t>(year),
-                                    static_cast<int32_t>(month),
-                                    static_cast<int32_t>(day),
-                                    static_cast<int32_t>(hour),
-                                    static_cast<int32_t>(minute),
-                                    static_cast<int32_t>(second),
-                                    static_cast<int32_t>(microsecond));
+  *out = PyDateTime_FromDateAndTime(
+      static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day),
+      static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+      static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
   return Status::OK();
 }
 
@@ -249,9 +240,9 @@ static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
   total_seconds += PyDateTime_DATE_GET_SECOND(pydate);
   total_seconds += PyDateTime_DATE_GET_MINUTE(pydate) * 60;
   total_seconds += PyDateTime_DATE_GET_HOUR(pydate) * 3600;
-  int64_t days = get_days_from_date(PyDateTime_GET_YEAR(pydate),
-                                    PyDateTime_GET_MONTH(pydate),
-                                    PyDateTime_GET_DAY(pydate));
+  int64_t days =
+      get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate),
+                         PyDateTime_GET_DAY(pydate));
   total_seconds += days * 24 * 3600;
   return total_seconds * 1000;
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/dc533211/cpp/src/arrow/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h
index 3dc39fc..a4c4fdd 100644
--- a/cpp/src/arrow/test-common.h
+++ b/cpp/src/arrow/test-common.h
@@ -75,6 +75,12 @@ std::shared_ptr<Array> TestBase::MakeRandomArray(int64_t length, int64_t null_co
 }
 
 template <>
+std::shared_ptr<Array> TestBase::MakeRandomArray<NullArray>(int64_t length,
+                                                            int64_t null_count) {
+  return std::make_shared<NullArray>(length);
+}
+
+template <>
 std::shared_ptr<Array> TestBase::MakeRandomArray<FixedSizeBinaryArray>(
     int64_t length, int64_t null_count) {
   const int byte_width = 10;


Mime
View raw message