arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [04/18] arrow git commit: ARROW-1625: [Serialization] Support OrderedDict and defaultdict serialization
Date Tue, 03 Oct 2017 12:59:46 GMT
ARROW-1625: [Serialization] Support OrderedDict and defaultdict serialization

This PR adds support for OrderedDict and defaultdict objects using custom serialization handlers.

Author: Philipp Moritz <pcmoritz@gmail.com>

Closes #1152 from pcmoritz/pydict-exact2 and squashes the following commits:

431e0272 [Philipp Moritz] make cloudpickle optional
052b1aa9 [Philipp Moritz] I'd prefer this not to be a runtime dependency
db19ab9b [Philipp Moritz] add tests
799d983e [Philipp Moritz] do not interpret OrderedDict as dict


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c905783f
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c905783f
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c905783f

Branch: refs/heads/master
Commit: c905783fd6b5a173fbf994fb5c9c17477a786554
Parents: af167fd
Author: Philipp Moritz <pcmoritz@gmail.com>
Authored: Mon Oct 2 08:09:47 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Tue Oct 3 08:59:22 2017 -0400

----------------------------------------------------------------------
 ci/travis_script_python.sh                 |  1 +
 cpp/src/arrow/python/python_to_arrow.cc    |  2 +-
 python/pyarrow/tests/test_serialization.py | 38 +++++++++++++++++++++++--
 python/requirements.txt                    |  1 +
 4 files changed, 39 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c905783f/ci/travis_script_python.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index b779aec..6941543 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -36,6 +36,7 @@ which python
 
 conda install -y -q pip \
       nomkl \
+      cloudpickle \
       numpy=1.13.1 \
       pandas \
       cython \

http://git-wip-us.apache.org/repos/asf/arrow/blob/c905783f/cpp/src/arrow/python/python_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 9ba7821..a693a08 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -469,7 +469,7 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
   } else if (PyList_Check(elem)) {
     RETURN_NOT_OK(builder->AppendList(PyList_Size(elem)));
     sublists->push_back(elem);
-  } else if (PyDict_Check(elem)) {
+  } else if (PyDict_CheckExact(elem)) {
     RETURN_NOT_OK(builder->AppendDict(PyDict_Size(elem)));
     subdicts->push_back(elem);
   } else if (PyTuple_CheckExact(elem)) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/c905783f/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 7c8cace..eab81c2 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -19,7 +19,7 @@ from __future__ import division
 
 import pytest
 
-from collections import namedtuple
+from collections import namedtuple, OrderedDict, defaultdict
 import string
 import sys
 
@@ -50,6 +50,12 @@ def assert_equal(obj1, obj2):
                                                                   .format(
                                                                       obj1,
                                                                       obj2))
+        try:
+            # Workaround to make comparison of OrderedDicts work on Python 2.7
+            if obj1 == obj2:
+                return
+        except:
+            pass
         for key in obj1.__dict__.keys():
             if key not in special_keys:
                 assert_equal(obj1.__dict__[key], obj2.__dict__[key])
@@ -168,7 +174,8 @@ NamedTupleExample = namedtuple("Example",
 
 CUSTOM_OBJECTS = [Exception("Test object."), CustomError(), Point(11, y=22),
                   Foo(), Bar(), Baz(), Qux(), SubQux(), SubQuxPickle(),
-                  NamedTupleExample(1, 1.0, "hi", np.zeros([3, 5]), [1, 2, 3])]
+                  NamedTupleExample(1, 1.0, "hi", np.zeros([3, 5]), [1, 2, 3]),
+                  OrderedDict([("hello", 1), ("world", 2)])]
 
 
 def make_serialization_context():
@@ -213,6 +220,28 @@ def make_serialization_context():
                               custom_serializer=lambda obj: str(obj),
                               custom_deserializer=deserializer)
 
+    def ordered_dict_custom_serializer(obj):
+        return list(obj.keys()), list(obj.values())
+
+    def ordered_dict_custom_deserializer(obj):
+        return OrderedDict(zip(obj[0], obj[1]))
+
+    context.register_type(OrderedDict, 20 * b"\x12", pickle=False,
+                          custom_serializer=ordered_dict_custom_serializer,
+                          custom_deserializer=ordered_dict_custom_deserializer)
+
+    def default_dict_custom_serializer(obj):
+        return list(obj.keys()), list(obj.values()), obj.default_factory
+
+    def default_dict_custom_deserializer(obj):
+        return defaultdict(obj[2], zip(obj[0], obj[1]))
+
+    context.register_type(defaultdict, 20 * b"\x13", pickle=False,
+                          custom_serializer=default_dict_custom_serializer,
+                          custom_deserializer=default_dict_custom_deserializer)
+
+    context.register_type(type(lambda: 0), 20 * b"\x14", pickle=True)
+
     return context
 
 
@@ -266,6 +295,11 @@ def test_custom_serialization(large_memory_map):
         for obj in CUSTOM_OBJECTS:
             serialization_roundtrip(obj, mmap)
 
+def test_default_dict_serialization(large_memory_map):
+    cloudpickle = pytest.importorskip("cloudpickle")
+    with pa.memory_map(large_memory_map, mode="r+") as mmap:
+        obj = defaultdict(lambda: 0, [("hello", 1), ("world", 2)])
+        serialization_roundtrip(obj, mmap)
 
 def test_numpy_serialization(large_memory_map):
     with pa.memory_map(large_memory_map, mode="r+") as mmap:

http://git-wip-us.apache.org/repos/asf/arrow/blob/c905783f/python/requirements.txt
----------------------------------------------------------------------
diff --git a/python/requirements.txt b/python/requirements.txt
index 103f490..d2e28a7 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,3 +1,4 @@
 pytest
+cloudpickle
 numpy>=1.10.0
 six


Mime
View raw message