arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1665: [Serialization] Support more custom datatypes in the default serialization context
Date Thu, 12 Oct 2017 22:16:48 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 434df8af0 -> 47e6ff6cf


ARROW-1665: [Serialization] Support more custom datatypes in the default serialization context

Author: Philipp Moritz <pcmoritz@gmail.com>
Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1194 from pcmoritz/default-serialization-context and squashes the following commits:

43a4add8 [Wes McKinney] NumPy is hard requirement
7db591b8 [Philipp Moritz] update
20d75baf [Philipp Moritz] make custom serialization handlers accessible outside of the tests


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/47e6ff6c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/47e6ff6c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/47e6ff6c

Branch: refs/heads/master
Commit: 47e6ff6cf19a9d84d15f24715f4ddb87aa226d50
Parents: 434df8a
Author: Philipp Moritz <pcmoritz@gmail.com>
Authored: Thu Oct 12 18:16:34 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Thu Oct 12 18:16:34 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/__init__.py                 |   2 +
 python/pyarrow/ipc.py                      |  47 ---------
 python/pyarrow/serialization.py            | 126 ++++++++++++++++++++++++
 python/pyarrow/tests/test_serialization.py |  73 +++-----------
 4 files changed, 140 insertions(+), 108 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/47e6ff6c/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index f956347..e37c123 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -116,6 +116,8 @@ from pyarrow.ipc import (Message, MessageReader,
 
 localfs = LocalFileSystem.get_instance()
 
+from pyarrow.serialization import _default_serialization_context
+
 import pyarrow.types as types
 
 # Entry point for starting the plasma store

http://git-wip-us.apache.org/repos/asf/arrow/blob/47e6ff6c/python/pyarrow/ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index 1223673..f264f08 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -187,50 +187,3 @@ def deserialize_pandas(buf, nthreads=None):
     reader = pa.RecordBatchStreamReader(buffer_reader)
     table = reader.read_all()
     return table.to_pandas(nthreads=nthreads)
-
-
-# ----------------------------------------------------------------------
-# Set up default serialization context
-
-def _serialize_pandas_series(s):
-    import pandas as pd
-    # TODO: serializing Series without extra copy
-    serialized = serialize_pandas(pd.DataFrame({s.name: s}))
-    return {
-        'type': 'Series',
-        'data': serialized.to_pybytes()
-    }
-
-
-def _serialize_pandas_dataframe(df):
-    return {
-        'type': 'DataFrame',
-        'data': serialize_pandas(df).to_pybytes()
-    }
-
-
-def _deserialize_callback_pandas(data):
-    deserialized = deserialize_pandas(data['data'])
-    type_ = data['type']
-    if type_ == 'Series':
-        return deserialized[deserialized.columns[0]]
-    elif type_ == 'DataFrame':
-        return deserialized
-    else:
-        raise ValueError(type_)
-
-
-try:
-    import pandas as pd
-    lib._default_serialization_context.register_type(
-        pd.Series, 'pandas.Series',
-        custom_serializer=_serialize_pandas_series,
-        custom_deserializer=_deserialize_callback_pandas)
-
-    lib._default_serialization_context.register_type(
-        pd.DataFrame, 'pandas.DataFrame',
-        custom_serializer=_serialize_pandas_dataframe,
-        custom_deserializer=_deserialize_callback_pandas)
-except ImportError:
-    # no pandas
-    pass

http://git-wip-us.apache.org/repos/asf/arrow/blob/47e6ff6c/python/pyarrow/serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
new file mode 100644
index 0000000..d08ae89
--- /dev/null
+++ b/python/pyarrow/serialization.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections import OrderedDict, defaultdict
+import sys
+
+import numpy as np
+
+from pyarrow import serialize_pandas, deserialize_pandas
+from pyarrow.lib import _default_serialization_context
+
+# ----------------------------------------------------------------------
+# Set up serialization for primitive datatypes
+
+# TODO(pcm): This is currently a workaround until arrow supports
+# arbitrary precision integers. This is only called on long integers,
+# see the associated case in the append method in python_to_arrow.cc
+_default_serialization_context.register_type(
+    int, "int",
+    custom_serializer=lambda obj: str(obj),
+    custom_deserializer=lambda data: int(data))
+
+if (sys.version_info < (3, 0)):
+    _default_serialization_context.register_type(
+        long, "long",  # noqa: F821
+        custom_serializer=lambda obj: str(obj),
+        custom_deserializer=lambda data: long(data))  # noqa: F821
+
+
+def _serialize_ordered_dict(obj):
+    return list(obj.keys()), list(obj.values())
+
+
+def _deserialize_ordered_dict(data):
+    return OrderedDict(zip(data[0], data[1]))
+
+
+_default_serialization_context.register_type(
+    OrderedDict, "OrderedDict",
+    custom_serializer=_serialize_ordered_dict,
+    custom_deserializer=_deserialize_ordered_dict)
+
+
+def _serialize_default_dict(obj):
+    return list(obj.keys()), list(obj.values()), obj.default_factory
+
+
+def _deserialize_default_dict(data):
+    return defaultdict(data[2], zip(data[0], data[1]))
+
+
+_default_serialization_context.register_type(
+     defaultdict, "defaultdict",
+     custom_serializer=_serialize_default_dict,
+     custom_deserializer=_deserialize_default_dict)
+
+
+_default_serialization_context.register_type(
+     type(lambda: 0), "function",
+     pickle=True)
+
+# ----------------------------------------------------------------------
+# Set up serialization for numpy with dtype object (primitive types are
+# handled efficiently with Arrow's Tensor facilities, see python_to_arrow.cc)
+
+
+def _serialize_numpy_array(obj):
+    return obj.tolist(), obj.dtype.str
+
+
+def _deserialize_numpy_array(data):
+    return np.array(data[0], dtype=np.dtype(data[1]))
+
+
+_default_serialization_context.register_type(
+    np.ndarray, 'np.array',
+    custom_serializer=_serialize_numpy_array,
+    custom_deserializer=_deserialize_numpy_array)
+
+
+# ----------------------------------------------------------------------
+# Set up serialization for pandas Series and DataFrame
+
+try:
+    import pandas as pd
+
+    def _serialize_pandas_series(obj):
+        # TODO: serializing Series without extra copy
+        return serialize_pandas(pd.DataFrame({obj.name: obj})).to_pybytes()
+
+    def _deserialize_pandas_series(data):
+        deserialized = deserialize_pandas(data)
+        return deserialized[deserialized.columns[0]]
+
+    def _serialize_pandas_dataframe(obj):
+        return serialize_pandas(obj).to_pybytes()
+
+    def _deserialize_pandas_dataframe(data):
+        return deserialize_pandas(data)
+
+    _default_serialization_context.register_type(
+        pd.Series, 'pd.Series',
+        custom_serializer=_serialize_pandas_series,
+        custom_deserializer=_deserialize_pandas_series)
+
+    _default_serialization_context.register_type(
+        pd.DataFrame, 'pd.DataFrame',
+        custom_serializer=_serialize_pandas_dataframe,
+        custom_deserializer=_deserialize_pandas_dataframe)
+except ImportError:
+    # no pandas
+    pass

http://git-wip-us.apache.org/repos/asf/arrow/blob/47e6ff6c/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 5441b9f..a9fd102 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -181,67 +181,18 @@ CUSTOM_OBJECTS = [Exception("Test object."), CustomError(), Point(11, y=22),
 
 def make_serialization_context():
 
-    def array_custom_serializer(obj):
-        return obj.tolist(), obj.dtype.str
-
-    def array_custom_deserializer(serialized_obj):
-        return np.array(serialized_obj[0], dtype=np.dtype(serialized_obj[1]))
-
-    context = pa.SerializationContext()
-
-    # This is for numpy arrays of "object" only; primitive types are handled
-    # efficiently with Arrow's Tensor facilities (see python_to_arrow.cc)
-    context.register_type(np.ndarray, 20 * b"\x00",
-                          custom_serializer=array_custom_serializer,
-                          custom_deserializer=array_custom_deserializer)
-
-    context.register_type(Foo, 20 * b"\x01")
-    context.register_type(Bar, 20 * b"\x02")
-    context.register_type(Baz, 20 * b"\x03")
-    context.register_type(Qux, 20 * b"\x04")
-    context.register_type(SubQux, 20 * b"\x05")
-    context.register_type(SubQuxPickle, 20 * b"\x05", pickle=True)
-    context.register_type(Exception, 20 * b"\x06")
-    context.register_type(CustomError, 20 * b"\x07")
-    context.register_type(Point, 20 * b"\x08")
-    context.register_type(NamedTupleExample, 20 * b"\x09")
-
-    # TODO(pcm): This is currently a workaround until arrow supports
-    # arbitrary precision integers. This is only called on long integers,
-    # see the associated case in the append method in python_to_arrow.cc
-    context.register_type(int, 20 * b"\x10", pickle=False,
-                          custom_serializer=lambda obj: str(obj),
-                          custom_deserializer=(
-                              lambda serialized_obj: int(serialized_obj)))
-
-    if (sys.version_info < (3, 0)):
-        deserializer = (
-            lambda serialized_obj: long(serialized_obj))  # noqa: E501,F821
-        context.register_type(long, 20 * b"\x11", pickle=False,  # noqa: E501,F821
-                              custom_serializer=lambda obj: str(obj),
-                              custom_deserializer=deserializer)
-
-    def ordered_dict_custom_serializer(obj):
-        return list(obj.keys()), list(obj.values())
-
-    def ordered_dict_custom_deserializer(obj):
-        return OrderedDict(zip(obj[0], obj[1]))
-
-    context.register_type(OrderedDict, 20 * b"\x12", pickle=False,
-                          custom_serializer=ordered_dict_custom_serializer,
-                          custom_deserializer=ordered_dict_custom_deserializer)
-
-    def default_dict_custom_serializer(obj):
-        return list(obj.keys()), list(obj.values()), obj.default_factory
-
-    def default_dict_custom_deserializer(obj):
-        return defaultdict(obj[2], zip(obj[0], obj[1]))
-
-    context.register_type(defaultdict, 20 * b"\x13", pickle=False,
-                          custom_serializer=default_dict_custom_serializer,
-                          custom_deserializer=default_dict_custom_deserializer)
-
-    context.register_type(type(lambda: 0), 20 * b"\x14", pickle=True)
+    context = pa._default_serialization_context
+
+    context.register_type(Foo, "Foo")
+    context.register_type(Bar, "Bar")
+    context.register_type(Baz, "Baz")
+    context.register_type(Qux, "Quz")
+    context.register_type(SubQux, "SubQux")
+    context.register_type(SubQuxPickle, "SubQuxPickle", pickle=True)
+    context.register_type(Exception, "Exception")
+    context.register_type(CustomError, "CustomError")
+    context.register_type(Point, "Point")
+    context.register_type(NamedTupleExample, "NamedTupleExample")
 
     return context
 


Mime
View raw message