arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1678: [Python] Implement numpy.float16 SerDe
Date Tue, 17 Oct 2017 23:21:10 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 8eb2b0ee6 -> a04301833


ARROW-1678: [Python] Implement numpy.float16 SerDe

This is the patch for [ARROW-1678](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1678).

Author: Licht-T <licht-t@outlook.jp>

Closes #1205 from Licht-T/feature-halffloat-python-serde and squashes the following commits:

e397cc90 [Licht-T] Add NPY_HALF case into SerializeArray method
22c37d76 [Licht-T] Use npy_half as type of the numpy float16 method return value
319abb7c [Licht-T] Add RETURN_IF_PYERROR after PyArrayScalar_New
eec4e0a2 [Licht-T] Change headers include order
5a985b8b [Licht-T] Fix lint issue
f19e5431 [Licht-T] TST: Add tests for numpy.float16 SerDe
66d5f15a [Licht-T] ENH: Set VISIT_INLINE for HalfFloat
834357e7 [Licht-T] ENH: Add numpy.float16 deserializer
02dc77b7 [Licht-T] ENH: Add numpy.float16 serializer
9d19e086 [Licht-T] ENH: Add HalfFloat type case for numpy
dda98b9e [Licht-T] ENH: Add HalfFloat type traits
0be305a3 [Licht-T] ENH: Add the IPC internal metadata for HalfFloat
7348cc67 [Licht-T] ENH: Add HalfFloat BUILDER_CASE


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/a0430183
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/a0430183
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/a0430183

Branch: refs/heads/master
Commit: a04301833f585a66fa371394c9ee33a4a59c2ed8
Parents: 8eb2b0e
Author: Licht-T <licht-t@outlook.jp>
Authored: Tue Oct 17 19:21:05 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Tue Oct 17 19:21:05 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc                   |  1 +
 cpp/src/arrow/ipc/metadata-internal.cc     |  4 ++++
 cpp/src/arrow/python/arrow_to_python.cc    | 13 +++++++++++++
 cpp/src/arrow/python/numpy_convert.cc      |  1 +
 cpp/src/arrow/python/numpy_to_arrow.cc     |  3 +++
 cpp/src/arrow/python/python_to_arrow.cc    | 15 +++++++++++++++
 cpp/src/arrow/python/type_traits.h         | 21 +++++++++++++++++++++
 cpp/src/arrow/visitor_inline.h             |  2 ++
 python/pyarrow/tests/test_serialization.py |  6 +++---
 9 files changed, 63 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 076c156..331de2d 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1438,6 +1438,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
       BUILDER_CASE(TIME64, Time64Builder);
       BUILDER_CASE(TIMESTAMP, TimestampBuilder);
       BUILDER_CASE(BOOL, BooleanBuilder);
+      BUILDER_CASE(HALF_FLOAT, HalfFloatBuilder);
       BUILDER_CASE(FLOAT, FloatBuilder);
       BUILDER_CASE(DOUBLE, DoubleBuilder);
       BUILDER_CASE(STRING, StringBuilder);

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/ipc/metadata-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 162afb9..ad00cfb 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -381,6 +381,10 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type,
       INT_TO_FB_CASE(64, false);
     case Type::INT64:
       INT_TO_FB_CASE(64, true);
+    case Type::HALF_FLOAT:
+      *out_type = flatbuf::Type_FloatingPoint;
+      *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_HALF);
+      break;
     case Type::FLOAT:
       *out_type = flatbuf::Type_FloatingPoint;
       *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_SINGLE);

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/python/arrow_to_python.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc
index ac459d4..761d290 100644
--- a/cpp/src/arrow/python/arrow_to_python.cc
+++ b/cpp/src/arrow/python/arrow_to_python.cc
@@ -17,10 +17,15 @@
 
 #include "arrow/python/arrow_to_python.h"
 
+#include "arrow/python/numpy_interop.h"
+
 #include <cstdint>
 #include <memory>
 #include <vector>
 
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+
 #include "arrow/array.h"
 #include "arrow/io/interfaces.h"
 #include "arrow/ipc/reader.h"
@@ -121,6 +126,14 @@ Status GetValue(PyObject* context, const Array& arr, int64_t index, int32_t type
       *result = PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
       return CheckPyError();
     }
+    case Type::HALF_FLOAT: {
+      *result = PyArrayScalar_New(Half);
+      RETURN_IF_PYERROR();
+
+      npy_half halffloat = static_cast<const HalfFloatArray&>(arr).Value(index);
+      PyArrayScalar_ASSIGN(*result, Half, halffloat);
+      return Status::OK();
+    }
     case Type::FLOAT:
       *result = PyFloat_FromDouble(static_cast<const FloatArray&>(arr).Value(index));
       return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/python/numpy_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index bef8054..9ed2d73 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -165,6 +165,7 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
 #if (NPY_UINT64 != NPY_ULONGLONG)
     TO_ARROW_CASE(ULONGLONG);
 #endif
+    TO_ARROW_TYPE_CASE(FLOAT16, float16);
     TO_ARROW_TYPE_CASE(FLOAT32, float32);
     TO_ARROW_TYPE_CASE(FLOAT64, float64);
     TO_ARROW_TYPE_CASE(STRING, binary);

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/python/numpy_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index e935b45..27ee230 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -284,6 +284,8 @@ class NumPyConverter {
     return VisitNative<T>();
   }
 
+  Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }
+
   Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
   Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); }
   Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
@@ -1183,6 +1185,7 @@ Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type,
     LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
     LIST_CASE(INT64, NPY_INT64, Int64Type)
     LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
+    LIST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
     LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
     LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
     LIST_CASE(STRING, NPY_OBJECT, StringType)

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/python/python_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index e2d7452..ab444f2 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -58,6 +58,7 @@ class SequenceBuilder {
         ints_(::arrow::int64(), pool),
         bytes_(::arrow::binary(), pool),
         strings_(pool),
+        half_floats_(::arrow::float16(), pool),
         floats_(::arrow::float32(), pool),
         doubles_(::arrow::float64(), pool),
         date64s_(::arrow::date64(), pool),
@@ -117,6 +118,11 @@ class SequenceBuilder {
     return strings_.Append(data, length);
   }
 
+  /// Appending a half_float to the sequence
+  Status AppendHalfFloat(const npy_half data) {
+    return AppendPrimitive(data, &half_float_tag_, &half_floats_);
+  }
+
   /// Appending a float to the sequence
   Status AppendFloat(const float data) {
     return AppendPrimitive(data, &float_tag_, &floats_);
@@ -222,6 +228,7 @@ class SequenceBuilder {
     RETURN_NOT_OK(AddElement(int_tag_, &ints_));
     RETURN_NOT_OK(AddElement(string_tag_, &strings_));
     RETURN_NOT_OK(AddElement(bytes_tag_, &bytes_));
+    RETURN_NOT_OK(AddElement(half_float_tag_, &half_floats_));
     RETURN_NOT_OK(AddElement(float_tag_, &floats_));
     RETURN_NOT_OK(AddElement(double_tag_, &doubles_));
     RETURN_NOT_OK(AddElement(date64_tag_, &date64s_));
@@ -250,6 +257,7 @@ class SequenceBuilder {
   Int64Builder ints_;
   BinaryBuilder bytes_;
   StringBuilder strings_;
+  HalfFloatBuilder half_floats_;
   FloatBuilder floats_;
   DoubleBuilder doubles_;
   Date64Builder date64s_;
@@ -274,6 +282,7 @@ class SequenceBuilder {
   int8_t int_tag_ = -1;
   int8_t string_tag_ = -1;
   int8_t bytes_tag_ = -1;
+  int8_t half_float_tag_ = -1;
   int8_t float_tag_ = -1;
   int8_t double_tag_ = -1;
   int8_t date64_tag_ = -1;
@@ -394,6 +403,8 @@ Status SerializeSequences(PyObject* context, std::vector<PyObject*> sequences,
 Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
   if (PyArray_IsScalar(obj, Bool)) {
     return builder->AppendBool(reinterpret_cast<PyBoolScalarObject*>(obj)->obval != 0);
+  } else if (PyArray_IsScalar(obj, Half)) {
+    return builder->AppendHalfFloat(reinterpret_cast<PyHalfScalarObject*>(obj)->obval);
   } else if (PyArray_IsScalar(obj, Float)) {
     return builder->AppendFloat(reinterpret_cast<PyFloatScalarObject*>(obj)->obval);
   } else if (PyArray_IsScalar(obj, Double)) {
@@ -437,6 +448,9 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
   // The bool case must precede the int case (PyInt_Check passes for bools)
   if (PyBool_Check(elem)) {
     RETURN_NOT_OK(builder->AppendBool(elem == Py_True));
+  } else if (PyArray_DescrFromScalar(elem)->type_num == NPY_HALF) {
+    npy_half halffloat = reinterpret_cast<PyHalfScalarObject *>(elem)->obval;
+    RETURN_NOT_OK(builder->AppendHalfFloat(halffloat));
   } else if (PyFloat_Check(elem)) {
     RETURN_NOT_OK(builder->AppendDouble(PyFloat_AS_DOUBLE(elem)));
   } else if (PyLong_Check(elem)) {
@@ -523,6 +537,7 @@ Status SerializeArray(PyObject* context, PyArrayObject* array, SequenceBuilder*
     case NPY_INT32:
     case NPY_UINT64:
     case NPY_INT64:
+    case NPY_HALF:
     case NPY_FLOAT:
     case NPY_DOUBLE: {
       RETURN_NOT_OK(builder->AppendTensor(static_cast<int32_t>(tensors_out->size())));

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/python/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h
index 2cbbdf4..587b27c 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -24,6 +24,8 @@
 
 #include "arrow/python/numpy_interop.h"
 
+#include <numpy/halffloat.h>
+
 #include "arrow/builder.h"
 #include "arrow/type.h"
 #include "arrow/util/logging.h"
@@ -72,6 +74,17 @@ NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
 #endif
 
 template <>
+struct npy_traits<NPY_FLOAT16> {
+  typedef npy_half value_type;
+  using TypeClass = HalfFloatType;
+  using BuilderClass = HalfFloatBuilder;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; }
+};
+
+template <>
 struct npy_traits<NPY_FLOAT32> {
   typedef float value_type;
   using TypeClass = FloatType;
@@ -144,6 +157,14 @@ INT_DECL(UINT32);
 INT_DECL(UINT64);
 
 template <>
+struct arrow_traits<Type::HALF_FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT16;
+  static constexpr bool supports_nulls = true;
+  static constexpr uint16_t na_value = NPY_HALF_NAN;
+  typedef typename npy_traits<NPY_FLOAT16>::value_type T;
+};
+
+template <>
 struct arrow_traits<Type::FLOAT> {
   static constexpr int npy_type = NPY_FLOAT32;
   static constexpr bool supports_nulls = true;

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/cpp/src/arrow/visitor_inline.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 54f9e88..5ecabd2 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -44,6 +44,7 @@ inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) {
     TYPE_VISIT_INLINE(UInt32Type);
     TYPE_VISIT_INLINE(Int64Type);
     TYPE_VISIT_INLINE(UInt64Type);
+    TYPE_VISIT_INLINE(HalfFloatType);
     TYPE_VISIT_INLINE(FloatType);
     TYPE_VISIT_INLINE(DoubleType);
     TYPE_VISIT_INLINE(StringType);
@@ -85,6 +86,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
     ARRAY_VISIT_INLINE(UInt32Type);
     ARRAY_VISIT_INLINE(Int64Type);
     ARRAY_VISIT_INLINE(UInt64Type);
+    ARRAY_VISIT_INLINE(HalfFloatType);
     ARRAY_VISIT_INLINE(FloatType);
     ARRAY_VISIT_INLINE(DoubleType);
     ARRAY_VISIT_INLINE(StringType);

http://git-wip-us.apache.org/repos/asf/arrow/blob/a0430183/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index a9fd102..7e8060b 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -99,7 +99,7 @@ PRIMITIVE_OBJECTS = [
     {True: "hello", False: "world"}, {"hello": "world", 1: 42, 2.5: 45},
     {"hello": set([2, 3]), "world": set([42.0]), "this": None},
     np.int8(3), np.int32(4), np.int64(5),
-    np.uint8(3), np.uint32(4), np.uint64(5), np.float32(1.9),
+    np.uint8(3), np.uint32(4), np.uint64(5), np.float16(1.9), np.float32(1.9),
     np.float64(1.9), np.zeros([100, 100]),
     np.random.normal(size=[100, 100]), np.array(["hi", 3]),
     np.array(["hi", 3], dtype=object),
@@ -257,8 +257,8 @@ def test_default_dict_serialization(large_memory_map):
 
 def test_numpy_serialization(large_memory_map):
     with pa.memory_map(large_memory_map, mode="r+") as mmap:
-        for t in ["int8", "uint8", "int16", "uint16",
-                  "int32", "uint32", "float32", "float64"]:
+        for t in ["int8", "uint8", "int16", "uint16", "int32", "uint32",
+                  "float16", "float32", "float64"]:
             obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
             serialization_roundtrip(obj, mmap)
 


Mime
View raw message