arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1225: [Python] Decode bytes to utf8 unicode if possible when passing explicit utf8 type to pyarrow.array
Date Mon, 07 Aug 2017 14:45:46 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 3200e914d -> 619472ec0


ARROW-1225: [Python] Decode bytes to utf8 unicode if possible when passing explicit utf8 type
to pyarrow.array

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #946 from wesm/ARROW-1225 and squashes the following commits:

aa737b11 [Wes McKinney] Clearer error message
2f439285 [Wes McKinney] Decode bytes to utf8 unicode if possible when passing explicit utf8
type in pyarrow.array


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/619472ec
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/619472ec
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/619472ec

Branch: refs/heads/master
Commit: 619472ec0a6256fc6ead491fb12881b97f3acec3
Parents: 3200e91
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Aug 7 10:45:40 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Aug 7 10:45:40 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/builtin_convert.cc      | 22 ++++++++++++++++------
 python/pyarrow/tests/test_convert_builtin.py | 13 +++++++++++++
 2 files changed, 29 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/619472ec/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index d3bf76d..218fe29 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -530,14 +530,24 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
     const char* bytes;
     Py_ssize_t length;
 
-    if (item.obj() == Py_None) {
+    PyObject* obj = item.obj();
+    if (obj == Py_None) {
       return typed_builder_->AppendNull();
-    } else if (!PyUnicode_Check(item.obj())) {
-      return Status::Invalid("Non-unicode value encountered");
+    } else if (PyBytes_Check(obj)) {
+      tmp.reset(PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj),
+                                            PyBytes_GET_SIZE(obj)));
+      RETURN_IF_PYERROR();
+      bytes_obj = obj;
+    } else if (!PyUnicode_Check(obj)) {
+      PyObjectStringify stringified(obj);
+      std::stringstream ss;
+      ss << "Non bytes/unicode value encountered: " << stringified.bytes;
+      return Status::Invalid(ss.str());
+    } else {
+      tmp.reset(PyUnicode_AsUTF8String(obj));
+      RETURN_IF_PYERROR();
+      bytes_obj = tmp.obj();
     }
-    tmp.reset(PyUnicode_AsUTF8String(item.obj()));
-    RETURN_IF_PYERROR();
-    bytes_obj = tmp.obj();
 
     // No error checking
     length = PyBytes_GET_SIZE(bytes_obj);

http://git-wip-us.apache.org/repos/asf/arrow/blob/619472ec/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index ec26159..d18ed95 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -16,6 +16,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import pytest
+
 from pyarrow.compat import unittest, u  # noqa
 import pyarrow as pa
 
@@ -140,6 +142,17 @@ class TestConvertSequence(unittest.TestCase):
         assert arr.type == pa.binary()
         assert arr.to_pylist() == [b'foo', u1, None]
 
+    def test_utf8_to_unicode(self):
+        # ARROW-1225
+        data = [b'foo', None, b'bar']
+        arr = pa.array(data, type=pa.string())
+        assert arr[0].as_py() == u'foo'
+
+        # test a non-utf8 unicode string
+        val = (u'mañana').encode('utf-16-le')
+        with pytest.raises(pa.ArrowException):
+            pa.array([val], type=pa.string())
+
     def test_fixed_size_bytes(self):
         data = [b'foof', None, b'barb', b'2346']
         arr = pa.array(data, type=pa.binary(4))


Mime
View raw message