arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-30: [Python] Routines for converting between arrow::Array/Table and pandas.DataFrame
Date Mon, 28 Mar 2016 16:36:28 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 017187749 -> 1fd0668a1


ARROW-30: [Python] Routines for converting between arrow::Array/Table and pandas.DataFrame

There is a lot to do here for maximum compatibility, but this gets things started.

Author: Wes McKinney <wesm@apache.org>

Closes #46 from wesm/ARROW-30 and squashes the following commits:

0a9e747 [Wes McKinney] Invoke py.test with python -m pytest
4c9f766 [Wes McKinney] More scaffolding. Table wrapper. Initial unit tests passing
8475a0e [Wes McKinney] More pandas conversion scaffolding, enable libpyarrow to use the NumPy C API globally
d1f05c5 [Wes McKinney] cpplint
f0cc451 [Wes McKinney] Give libpyarrow a reference to numpy.nan
5e09bfe [Wes McKinney] Compiling, but untested draft of pandas <-> arrow converters


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1fd0668a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1fd0668a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1fd0668a

Branch: refs/heads/master
Commit: 1fd0668a1330e72b1b137d90d00906bc188243e0
Parents: 0171877
Author: Wes McKinney <wesm@apache.org>
Authored: Mon Mar 28 09:36:20 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Mon Mar 28 09:36:20 2016 -0700

----------------------------------------------------------------------
 ci/travis_script_python.sh                  |   8 +-
 cpp/README.md                               |   6 +-
 cpp/src/arrow/array.h                       |  13 +-
 cpp/src/arrow/types/string.cc               |  10 +
 cpp/src/arrow/types/string.h                |   4 +-
 cpp/src/arrow/util/buffer.h                 |  42 ++
 python/CMakeLists.txt                       |   6 +-
 python/pyarrow/__init__.py                  |   8 +-
 python/pyarrow/array.pyx                    | 135 +++++
 python/pyarrow/config.pyx                   |  13 +-
 python/pyarrow/includes/common.pxd          |   6 +
 python/pyarrow/includes/libarrow.pxd        |  52 +-
 python/pyarrow/includes/pyarrow.pxd         |   9 +-
 python/pyarrow/tests/test_convert_pandas.py | 172 ++++++
 python/src/pyarrow/adapters/pandas.cc       | 714 +++++++++++++++++++++++
 python/src/pyarrow/adapters/pandas.h        |  21 +
 python/src/pyarrow/common.h                 |  23 +-
 python/src/pyarrow/config.cc                |  34 ++
 python/src/pyarrow/config.h                 |  39 ++
 python/src/pyarrow/do_import_numpy.h        |  21 +
 python/src/pyarrow/init.cc                  |  25 -
 python/src/pyarrow/init.h                   |  27 -
 python/src/pyarrow/numpy_interop.h          |  58 ++
 23 files changed, 1355 insertions(+), 91 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/ci/travis_script_python.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index af6b008..d45b895 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -48,17 +48,11 @@ python_version_tests() {
 
   python setup.py build_ext --inplace
 
-  py.test -vv -r sxX pyarrow
+  python -m pytest -vv -r sxX pyarrow
 }
 
 # run tests for python 2.7 and 3.5
 python_version_tests 2.7
 python_version_tests 3.5
 
-# if [ $TRAVIS_OS_NAME == "linux" ]; then
-#   valgrind --tool=memcheck py.test -vv -r sxX arrow
-# else
-#   py.test -vv -r sxX arrow
-# fi
-
 popd

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/README.md
----------------------------------------------------------------------
diff --git a/cpp/README.md b/cpp/README.md
index 542cce4..9026cf9 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -42,12 +42,12 @@ Detailed unit test logs will be placed in the build directory under `build/test-
 
 ### Building/Running benchmarks
 
-Follow the directions for simple build except run cmake 
+Follow the directions for simple build except run cmake
 with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly:
 
     cmake -DARROW_BUILD_BENCHMARKS=ON ..
 
-and instead of make unittest run either `make; ctest` to run both unit tests 
+and instead of make unittest run either `make; ctest` to run both unit tests
 and benchmarks or `make runbenchmark` to run only the benchmark tests.
 
 Benchmark logs will be placed in the build directory under `build/benchmark-logs`.
@@ -60,4 +60,4 @@ variables
 
 * Googletest: `GTEST_HOME` (only required to build the unit tests)
 * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks)
-
+* Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions)

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 133adf3..097634d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -34,13 +34,10 @@ class Buffer;
 //
 // The base class is only required to have a null bitmap buffer if the null
 // count is greater than 0
-//
-// Any buffers used to initialize the array have their references "stolen". If
-// you wish to use the buffer beyond the lifetime of the array, you need to
-// explicitly increment its reference count
 class Array {
  public:
-  Array(const TypePtr& type, int32_t length, int32_t null_count = 0,
+  Array(const std::shared_ptr<DataType>& type, int32_t length,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   virtual ~Array() {}
@@ -60,11 +57,15 @@ class Array {
     return null_bitmap_;
   }
 
+  const uint8_t* null_bitmap_data() const {
+    return null_bitmap_data_;
+  }
+
   bool EqualsExact(const Array& arr) const;
   virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;
 
  protected:
-  TypePtr type_;
+  std::shared_ptr<DataType> type_;
   int32_t null_count_;
   int32_t length_;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/types/string.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc
index dea42e1..80b075c 100644
--- a/cpp/src/arrow/types/string.cc
+++ b/cpp/src/arrow/types/string.cc
@@ -20,8 +20,18 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/type.h"
+
 namespace arrow {
 
+const std::shared_ptr<DataType> STRING(new StringType());
+
+StringArray::StringArray(int32_t length,
+    const std::shared_ptr<Buffer>& offsets,
+    const ArrayPtr& values, int32_t null_count,
+    const std::shared_ptr<Buffer>& null_bitmap) :
+    StringArray(STRING, length, offsets, values, null_count, null_bitmap) {}
+
 std::string CharType::ToString() const {
   std::stringstream s;
   s << "char(" << size << ")";

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/types/string.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index fda722b..84cd032 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -79,9 +79,7 @@ class StringArray : public ListArray {
       const std::shared_ptr<Buffer>& offsets,
       const ArrayPtr& values,
       int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& null_bitmap = nullptr) :
-      StringArray(std::make_shared<StringType>(), length, offsets, values,
-          null_count, null_bitmap) {}
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   // Compute the pointer t
   const uint8_t* GetValue(int i, int32_t* out_length) const {

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/util/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h
index 0c3e210..c15f9b6 100644
--- a/cpp/src/arrow/util/buffer.h
+++ b/cpp/src/arrow/util/buffer.h
@@ -18,11 +18,13 @@
 #ifndef ARROW_UTIL_BUFFER_H
 #define ARROW_UTIL_BUFFER_H
 
+#include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <memory>
 
 #include "arrow/util/macros.h"
+#include "arrow/util/status.h"
 
 namespace arrow {
 
@@ -146,6 +148,46 @@ class PoolBuffer : public ResizableBuffer {
   MemoryPool* pool_;
 };
 
+static constexpr int64_t MIN_BUFFER_CAPACITY = 1024;
+
+class BufferBuilder {
+ public:
+  explicit BufferBuilder(MemoryPool* pool) :
+      pool_(pool),
+      capacity_(0),
+      size_(0) {}
+
+  Status Append(const uint8_t* data, int length) {
+    if (capacity_ < length + size_) {
+      if (capacity_ == 0) {
+        buffer_ = std::make_shared<PoolBuffer>(pool_);
+      }
+      capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_);
+      while (capacity_ < length + size_) {
+        capacity_ *= 2;
+      }
+      RETURN_NOT_OK(buffer_->Resize(capacity_));
+      data_ = buffer_->mutable_data();
+    }
+    memcpy(data_ + size_, data, length);
+    size_ += length;
+    return Status::OK();
+  }
+
+  std::shared_ptr<Buffer> Finish() {
+    auto result = buffer_;
+    buffer_ = nullptr;
+    return result;
+  }
+
+ private:
+  std::shared_ptr<PoolBuffer> buffer_;
+  MemoryPool* pool_;
+  uint8_t* data_;
+  int64_t capacity_;
+  int64_t size_;
+};
+
 } // namespace arrow
 
 #endif // ARROW_UTIL_BUFFER_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 0ecafc7..ebe825f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -220,9 +220,12 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")
 
 ## Python and libraries
 find_package(PythonLibsNew REQUIRED)
+find_package(NumPy REQUIRED)
 include(UseCython)
 
 include_directories(SYSTEM
+  ${NUMPY_INCLUDE_DIRS}
+  ${PYTHON_INCLUDE_DIRS}
   src)
 
 ############################################################
@@ -409,11 +412,12 @@ add_subdirectory(src/pyarrow/util)
 
 set(PYARROW_SRCS
   src/pyarrow/common.cc
+  src/pyarrow/config.cc
   src/pyarrow/helpers.cc
-  src/pyarrow/init.cc
   src/pyarrow/status.cc
 
   src/pyarrow/adapters/builtin.cc
+  src/pyarrow/adapters/pandas.cc
 )
 
 set(LINK_LIBS

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9a08070..c343f5b 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -17,7 +17,11 @@
 
 # flake8: noqa
 
-from pyarrow.array import (Array, from_pylist, total_allocated_bytes,
+import pyarrow.config
+
+from pyarrow.array import (Array,
+                           from_pandas_series, from_pylist,
+                           total_allocated_bytes,
                            BooleanArray, NumericArray,
                            Int8Array, UInt8Array,
                            ListArray, StringArray)
@@ -37,4 +41,4 @@ from pyarrow.schema import (null, bool_,
                             list_, struct, field,
                             DataType, Field, Schema, schema)
 
-from pyarrow.array import RowBatch
+from pyarrow.array import RowBatch, Table, from_pandas_dataframe

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index c5d40dd..88770cd 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -22,6 +22,8 @@
 from pyarrow.includes.libarrow cimport *
 cimport pyarrow.includes.pyarrow as pyarrow
 
+import pyarrow.config
+
 from pyarrow.compat import frombytes, tobytes
 from pyarrow.error cimport check_status
 
@@ -44,6 +46,10 @@ cdef class Array:
         self.type = DataType()
         self.type.init(self.sp_array.get().type())
 
+    @staticmethod
+    def from_pandas(obj, mask=None):
+        return from_pandas_series(obj, mask)
+
     property null_count:
 
         def __get__(self):
@@ -160,7 +166,15 @@ cdef class StringArray(Array):
 cdef dict _array_classes = {
     Type_NA: NullArray,
     Type_BOOL: BooleanArray,
+    Type_UINT8: UInt8Array,
+    Type_UINT16: UInt16Array,
+    Type_UINT32: UInt32Array,
+    Type_UINT64: UInt64Array,
+    Type_INT8: Int8Array,
+    Type_INT16: Int16Array,
+    Type_INT32: Int32Array,
     Type_INT64: Int64Array,
+    Type_FLOAT: FloatArray,
     Type_DOUBLE: DoubleArray,
     Type_LIST: ListArray,
     Type_STRING: StringArray,
@@ -194,6 +208,49 @@ def from_pylist(object list_obj, DataType type=None):
 
     return box_arrow_array(sp_array)
 
+
+def from_pandas_series(object series, object mask=None):
+    cdef:
+        shared_ptr[CArray] out
+
+    series_values = series_as_ndarray(series)
+
+    if mask is None:
+        check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(),
+                                           series_values, &out))
+    else:
+        mask = series_as_ndarray(mask)
+        check_status(pyarrow.PandasMaskedToArrow(
+            pyarrow.GetMemoryPool(), series_values, mask, &out))
+
+    return box_arrow_array(out)
+
+
+def from_pandas_dataframe(object df, name=None):
+    cdef:
+        list names = []
+        list arrays = []
+
+    for name in df.columns:
+        col = df[name]
+        arr = from_pandas_series(col)
+
+        names.append(name)
+        arrays.append(arr)
+
+    return Table.from_arrays(names, arrays, name=name)
+
+
+cdef object series_as_ndarray(object obj):
+    import pandas as pd
+
+    if isinstance(obj, pd.Series):
+        result = obj.values
+    else:
+        result = obj
+
+    return result
+
 #----------------------------------------------------------------------
 # Table-like data structures
 
@@ -225,3 +282,81 @@ cdef class RowBatch:
 
     def __getitem__(self, i):
         return self.arrays[i]
+
+
+cdef class Table:
+    '''
+    Do not call this class's constructor directly.
+    '''
+    cdef:
+        shared_ptr[CTable] sp_table
+        CTable* table
+
+    def __cinit__(self):
+        pass
+
+    cdef init(self, const shared_ptr[CTable]& table):
+        self.sp_table = table
+        self.table = table.get()
+
+    @staticmethod
+    def from_pandas(df, name=None):
+        pass
+
+    @staticmethod
+    def from_arrays(names, arrays, name=None):
+        cdef:
+            Array arr
+            Table result
+            c_string c_name
+            vector[shared_ptr[CField]] fields
+            vector[shared_ptr[CColumn]] columns
+            shared_ptr[CSchema] schema
+            shared_ptr[CTable] table
+
+        cdef int K = len(arrays)
+
+        fields.resize(K)
+        columns.resize(K)
+        for i in range(K):
+            arr = arrays[i]
+            c_name = tobytes(names[i])
+
+            fields[i].reset(new CField(c_name, arr.type.sp_type, True))
+            columns[i].reset(new CColumn(fields[i], arr.sp_array))
+
+        if name is None:
+            c_name = ''
+        else:
+            c_name = tobytes(name)
+
+        schema.reset(new CSchema(fields))
+        table.reset(new CTable(c_name, schema, columns))
+
+        result = Table()
+        result.init(table)
+
+        return result
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Table to a pandas DataFrame
+        """
+        cdef:
+            PyObject* arr
+            shared_ptr[CColumn] col
+
+        import pandas as pd
+
+        names = []
+        data = []
+        for i in range(self.table.num_columns()):
+            col = self.table.column(i)
+            check_status(pyarrow.ArrowToPandas(col, &arr))
+            names.append(frombytes(col.get().name()))
+            data.append(<object> arr)
+
+            # One ref count too many
+            Py_XDECREF(arr)
+
+        return pd.DataFrame(dict(zip(names, data)), columns=names)

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/config.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx
index 521bc06..1047a47 100644
--- a/python/pyarrow/config.pyx
+++ b/python/pyarrow/config.pyx
@@ -2,7 +2,18 @@
 # distutils: language = c++
 # cython: embedsignature = True
 
-cdef extern from 'pyarrow/init.h' namespace 'pyarrow':
+cdef extern from 'pyarrow/do_import_numpy.h':
+    pass
+
+cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow':
+    int import_numpy()
+
+cdef extern from 'pyarrow/config.h' namespace 'pyarrow':
     void pyarrow_init()
+    void pyarrow_set_numpy_nan(object o)
 
+import_numpy()
 pyarrow_init()
+
+import numpy as np
+pyarrow_set_numpy_nan(np.nan)

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/common.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 839427a..e86d5d7 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -22,10 +22,16 @@ from libcpp cimport bool as c_bool
 from libcpp.string cimport string as c_string
 from libcpp.vector cimport vector
 
+from cpython cimport PyObject
+cimport cpython
+
 # This must be included for cerr and other things to work
 cdef extern from "<iostream>":
     pass
 
+cdef extern from "<Python.h>":
+    void Py_XDECREF(PyObject* o)
+
 cdef extern from "<memory>" namespace "std" nogil:
 
     cdef cppclass shared_ptr[T]:

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 943a08f..42f1f25 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -20,6 +20,25 @@
 from pyarrow.includes.common cimport *
 
 cdef extern from "arrow/api.h" namespace "arrow" nogil:
+    # We can later add more of the common status factory methods as needed
+    cdef CStatus CStatus_OK "Status::OK"()
+
+    cdef cppclass CStatus "arrow::Status":
+        CStatus()
+
+        c_string ToString()
+
+        c_bool ok()
+        c_bool IsOutOfMemory()
+        c_bool IsKeyError()
+        c_bool IsNotImplemented()
+        c_bool IsInvalid()
+
+    cdef cppclass Buffer:
+        uint8_t* data()
+        int64_t size()
+
+cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
     enum Type" arrow::Type::type":
         Type_NA" arrow::Type::NA"
@@ -129,25 +148,30 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStringArray" arrow::StringArray"(CListArray):
         c_string GetString(int i)
 
+    cdef cppclass CChunkedArray" arrow::ChunkedArray":
+        pass
 
-cdef extern from "arrow/api.h" namespace "arrow" nogil:
-    # We can later add more of the common status factory methods as needed
-    cdef CStatus CStatus_OK "Status::OK"()
+    cdef cppclass CColumn" arrow::Column":
+        CColumn(const shared_ptr[CField]& field,
+                const shared_ptr[CArray]& data)
 
-    cdef cppclass CStatus "arrow::Status":
-        CStatus()
+        int64_t length()
+        int64_t null_count()
+        const c_string& name()
+        const shared_ptr[CDataType]& type()
+        const shared_ptr[CChunkedArray]& data()
 
-        c_string ToString()
+    cdef cppclass CTable" arrow::Table":
+        CTable(const c_string& name, const shared_ptr[CSchema]& schema,
+               const vector[shared_ptr[CColumn]]& columns)
 
-        c_bool ok()
-        c_bool IsOutOfMemory()
-        c_bool IsKeyError()
-        c_bool IsNotImplemented()
-        c_bool IsInvalid()
+        int num_columns()
+        int num_rows()
 
-    cdef cppclass Buffer:
-        uint8_t* data()
-        int64_t size()
+        const c_string& name()
+
+        const shared_ptr[CSchema]& schema()
+        const shared_ptr[CColumn]& column(int i)
 
 
 cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil:

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index eedfc85..1066b80 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,7 +18,8 @@
 # distutils: language = c++
 
 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool
+from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType,
+                                        Type, MemoryPool)
 
 cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
     # We can later add more of the common status factory methods as needed
@@ -41,4 +42,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
     Status ConvertPySequence(object obj, shared_ptr[CArray]* out)
 
+    Status PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CArray]* out)
+    Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo,
+                               shared_ptr[CArray]* out)
+
+    Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out)
+
     MemoryPool* GetMemoryPool()

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
new file mode 100644
index 0000000..6dc9c68
--- /dev/null
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -0,0 +1,172 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+import pyarrow as A
+
+
+class TestPandasConversion(unittest.TestCase):
+
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def _check_pandas_roundtrip(self, df, expected=None):
+        table = A.from_pandas_dataframe(df)
+        result = table.to_pandas()
+        if expected is None:
+            expected = df
+        tm.assert_frame_equal(result, expected)
+
+    def test_float_no_nulls(self):
+        data = {}
+        numpy_dtypes = ['f4', 'f8']
+        num_values = 100
+
+        for dtype in numpy_dtypes:
+            values = np.random.randn(num_values)
+            data[dtype] = values.astype(dtype)
+
+        df = pd.DataFrame(data)
+        self._check_pandas_roundtrip(df)
+
+    def test_float_nulls(self):
+        num_values = 100
+
+        null_mask = np.random.randint(0, 10, size=num_values) < 3
+        dtypes = ['f4', 'f8']
+        expected_cols = []
+
+        arrays = []
+        for name in dtypes:
+            values = np.random.randn(num_values).astype(name)
+
+            arr = A.from_pandas_series(values, null_mask)
+            arrays.append(arr)
+
+            values[null_mask] = np.nan
+
+            expected_cols.append(values)
+
+        ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
+                                columns=dtypes)
+
+        table = A.Table.from_arrays(dtypes, arrays)
+        result = table.to_pandas()
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_integer_no_nulls(self):
+        data = {}
+
+        numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+        num_values = 100
+
+        for dtype in numpy_dtypes:
+            info = np.iinfo(dtype)
+            values = np.random.randint(info.min,
+                                       min(info.max, np.iinfo('i8').max),
+                                       size=num_values)
+            data[dtype] = values.astype(dtype)
+
+        df = pd.DataFrame(data)
+        self._check_pandas_roundtrip(df)
+
+    def test_integer_with_nulls(self):
+        # pandas requires upcast to float dtype
+
+        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+        num_values = 100
+
+        null_mask = np.random.randint(0, 10, size=num_values) < 3
+
+        expected_cols = []
+        arrays = []
+        for name in int_dtypes:
+            values = np.random.randint(0, 100, size=num_values)
+
+            arr = A.from_pandas_series(values, null_mask)
+            arrays.append(arr)
+
+            expected = values.astype('f8')
+            expected[null_mask] = np.nan
+
+            expected_cols.append(expected)
+
+        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
+                                columns=int_dtypes)
+
+        table = A.Table.from_arrays(int_dtypes, arrays)
+        result = table.to_pandas()
+
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_boolean_no_nulls(self):
+        num_values = 100
+
+        np.random.seed(0)
+
+        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
+        self._check_pandas_roundtrip(df)
+
+    def test_boolean_nulls(self):
+        # pandas requires upcast to object dtype
+        num_values = 100
+        np.random.seed(0)
+
+        mask = np.random.randint(0, 10, size=num_values) < 3
+        values = np.random.randint(0, 10, size=num_values) < 5
+
+        arr = A.from_pandas_series(values, mask)
+
+        expected = values.astype(object)
+        expected[mask] = None
+
+        ex_frame = pd.DataFrame({'bools': expected})
+
+        table = A.Table.from_arrays(['bools'], [arr])
+        result = table.to_pandas()
+
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_boolean_object_nulls(self):
+        arr = np.array([False, None, True] * 100, dtype=object)
+        df = pd.DataFrame({'bools': arr})
+        self._check_pandas_roundtrip(df)
+
+    def test_strings(self):
+        repeats = 1000
+        values = [b'foo', None, u'bar', 'qux', np.nan]
+        df = pd.DataFrame({'strings': values * repeats})
+
+        values = ['foo', None, u'bar', 'qux', None]
+        expected = pd.DataFrame({'strings': values * repeats})
+        self._check_pandas_roundtrip(df, expected)
+
+    # def test_category(self):
+    #     repeats = 1000
+    #     values = [b'foo', None, u'bar', 'qux', np.nan]
+    #     df = pd.DataFrame({'strings': values * repeats})
+    #     df['strings'] = df['strings'].astype('category')
+    #     self._check_pandas_roundtrip(df)

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
new file mode 100644
index 0000000..22f1d75
--- /dev/null
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -0,0 +1,714 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for pandas conversion via NumPy
+
+#include <Python.h>
+
+#include "pyarrow/numpy_interop.h"
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/api.h"
+#include "arrow/util/bit-util.h"
+
+#include "pyarrow/common.h"
+#include "pyarrow/config.h"
+#include "pyarrow/status.h"
+
+namespace pyarrow {
+
+using arrow::Array;
+using arrow::Column;
+namespace util = arrow::util;
+
+// ----------------------------------------------------------------------
+// Serialization
+
+// Maps a numpy type number to its C value type, the corresponding arrow
+// array class, and whether nulls can be inferred from the values
+// themselves (only floating point types, via NaN).
+template <int TYPE>
+struct npy_traits {
+};
+
+template <>
+struct npy_traits<NPY_BOOL> {
+  typedef uint8_t value_type;
+  using ArrayType = arrow::BooleanArray;
+
+  static constexpr bool supports_nulls = false;
+  static inline bool isnull(uint8_t v) {
+    return false;
+  }
+};
+
+#define NPY_INT_DECL(TYPE, CapType, T)              \
+  template <>                                       \
+  struct npy_traits<NPY_##TYPE> {                   \
+    typedef T value_type;                           \
+    using ArrayType = arrow::CapType##Array;        \
+                                                    \
+    static constexpr bool supports_nulls = false;   \
+    static inline bool isnull(T v) {                \
+      return false;                                 \
+    }                                               \
+  };
+
+NPY_INT_DECL(INT8, Int8, int8_t);
+NPY_INT_DECL(INT16, Int16, int16_t);
+NPY_INT_DECL(INT32, Int32, int32_t);
+NPY_INT_DECL(INT64, Int64, int64_t);
+NPY_INT_DECL(UINT8, UInt8, uint8_t);
+NPY_INT_DECL(UINT16, UInt16, uint16_t);
+NPY_INT_DECL(UINT32, UInt32, uint32_t);
+NPY_INT_DECL(UINT64, UInt64, uint64_t);
+
+template <>
+struct npy_traits<NPY_FLOAT32> {
+  typedef float value_type;
+  using ArrayType = arrow::FloatArray;
+
+  static constexpr bool supports_nulls = true;
+
+  // NaN is the only value that compares unequal to itself
+  static inline bool isnull(float v) {
+    return v != v;
+  }
+};
+
+template <>
+struct npy_traits<NPY_FLOAT64> {
+  typedef double value_type;
+  using ArrayType = arrow::DoubleArray;
+
+  static constexpr bool supports_nulls = true;
+
+  // NaN is the only value that compares unequal to itself
+  static inline bool isnull(double v) {
+    return v != v;
+  }
+};
+
+// Object arrays: null detection (None / np.nan) happens in the
+// specialized Convert() below, so no isnull() is provided here.
+template <>
+struct npy_traits<NPY_OBJECT> {
+  typedef PyObject* value_type;
+  static constexpr bool supports_nulls = true;
+};
+
+// Converts a 1-D numpy array (optionally with a separate boolean null
+// mask) into an arrow::Array.  TYPE is the numpy type number; per-type
+// behavior is driven by npy_traits<TYPE> and the ConvertData
+// specializations below.  Does not take ownership of arr/mask.
+template <int TYPE>
+class ArrowSerializer {
+ public:
+  ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) :
+      pool_(pool),
+      arr_(arr),
+      mask_(mask) {
+    length_ = PyArray_SIZE(arr_);
+  }
+
+  Status Convert(std::shared_ptr<Array>* out);
+
+  // Stride in bytes of the first dimension.
+  // NOTE(review): npy_intp is narrowed to int here — confirm arrays
+  // with strides beyond INT_MAX cannot reach this code.
+  int stride() const {
+    return PyArray_STRIDES(arr_)[0];
+  }
+
+  // Allocate and zero the validity bitmap sized for length_ values;
+  // sets null_bitmap_ / null_bitmap_data_.
+  Status InitNullBitmap() {
+    int null_bytes = util::bytes_for_bits(length_);
+
+    null_bitmap_ = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes));
+
+    null_bitmap_data_ = null_bitmap_->mutable_data();
+    memset(null_bitmap_data_, 0, null_bytes);
+
+    return Status::OK();
+  }
+
+  // True when the array's elements are not contiguous in memory.
+  bool is_strided() const {
+    npy_intp* astrides = PyArray_STRIDES(arr_);
+    return astrides[0] != PyArray_DESCR(arr_)->elsize;
+  }
+
+ private:
+  Status ConvertData();
+
+  // Build a StringArray from an object array of str/bytes.  unicode is
+  // encoded to UTF8, bytes are taken as-is, anything else is treated as
+  // null.  Assumes InitNullBitmap() was already called.
+  Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
+    PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+
+    // int32 offsets per the arrow string layout: offsets[i] is the
+    // start of value i, offsets[length_] the total byte length
+    auto offsets_buffer = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1)));
+    int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
+
+    arrow::BufferBuilder data_builder(pool_);
+    arrow::Status s;
+    PyObject* obj;
+    int length;
+    int offset = 0;
+    int64_t null_count = 0;
+    for (int64_t i = 0; i < length_; ++i) {
+      obj = objects[i];
+      if (PyUnicode_Check(obj)) {
+        // obj is rebound to a new bytes object; released after append
+        obj = PyUnicode_AsUTF8String(obj);
+        if (obj == NULL) {
+          PyErr_Clear();
+          return Status::TypeError("failed converting unicode to UTF8");
+        }
+        length = PyBytes_GET_SIZE(obj);
+        s = data_builder.Append(
+            reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length);
+        Py_DECREF(obj);
+        if (!s.ok()) {
+          return Status::ArrowError(s.ToString());
+        }
+        util::set_bit(null_bitmap_data_, i);
+      } else if (PyBytes_Check(obj)) {
+        length = PyBytes_GET_SIZE(obj);
+        RETURN_ARROW_NOT_OK(data_builder.Append(
+                reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length));
+        util::set_bit(null_bitmap_data_, i);
+      } else {
+        // NULL
+        // No change to offset
+        length = 0;
+        ++null_count;
+      }
+      offsets[i] = offset;
+      offset += length;
+    }
+    // End offset
+    offsets[length_] = offset;
+
+    std::shared_ptr<arrow::Buffer> data_buffer = data_builder.Finish();
+
+    auto values = std::make_shared<arrow::UInt8Array>(data_buffer->size(),
+        data_buffer);
+    *out = std::shared_ptr<arrow::Array>(
+        new arrow::StringArray(length_, offsets_buffer, values, null_count,
+            null_bitmap_));
+
+    return Status::OK();
+  }
+
+  // Build a BooleanArray from an object array of Python bools.  Any
+  // object that is neither True nor False counts as null.  Assumes
+  // InitNullBitmap() was already called.
+  Status ConvertBooleans(std::shared_ptr<Array>* out) {
+    PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+
+    int nbytes = util::bytes_for_bits(length_);
+    auto data = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(data->Resize(nbytes));
+    uint8_t* bitmap = data->mutable_data();
+    memset(bitmap, 0, nbytes);
+
+    int64_t null_count = 0;
+    for (int64_t i = 0; i < length_; ++i) {
+      if (objects[i] == Py_True) {
+        util::set_bit(bitmap, i);
+        util::set_bit(null_bitmap_data_, i);
+      } else if (objects[i] != Py_False) {
+        ++null_count;
+      } else {
+        util::set_bit(null_bitmap_data_, i);
+      }
+    }
+
+    *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count,
+        null_bitmap_);
+
+    return Status::OK();
+  }
+
+  arrow::MemoryPool* pool_;
+
+  PyArrayObject* arr_;
+  PyArrayObject* mask_;
+
+  // number of elements in arr_
+  int64_t length_;
+
+  std::shared_ptr<arrow::Buffer> data_;
+  std::shared_ptr<arrow::ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+};
+
+// Convert a boolean numpy mask (true == null) into an arrow validity
+// bitmap (set bit == valid).  Returns the null count.
+static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+  int64_t null_count = 0;
+  const uint8_t* mask_values = static_cast<const uint8_t*>(PyArray_DATA(mask));
+  // TODO(wesm): strided null mask
+  // int64_t index to match length and avoid truncation on large arrays
+  for (int64_t i = 0; i < length; ++i) {
+    if (mask_values[i]) {
+      ++null_count;
+    } else {
+      util::set_bit(bitmap, i);
+    }
+  }
+  return null_count;
+}
+
+// Build a validity bitmap by scanning the values themselves for nulls
+// (NaN for floating point, per npy_traits<TYPE>::isnull).  Returns the
+// null count.
+template <int TYPE>
+static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+  typedef npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+  const T* values = reinterpret_cast<const T*>(data);
+
+  // TODO(wesm): striding
+  // int64_t index to match length and avoid truncation on large arrays
+  for (int64_t i = 0; i < length; ++i) {
+    if (traits::isnull(values[i])) {
+      ++null_count;
+    } else {
+      util::set_bit(bitmap, i);
+    }
+  }
+
+  return null_count;
+}
+
+// Generic conversion path for numeric/boolean numpy arrays: build the
+// null bitmap (from the explicit mask when given, else from NaN values
+// when the type supports it), then wrap or copy the data into the
+// matching arrow array type.
+template <int TYPE>
+inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
+  typedef npy_traits<TYPE> traits;
+
+  if (mask_ != nullptr || traits::supports_nulls) {
+    RETURN_NOT_OK(InitNullBitmap());
+  }
+
+  int64_t null_count = 0;
+  if (mask_ != nullptr) {
+    null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
+  } else if (traits::supports_nulls) {
+    null_count = ValuesToBitmap<TYPE>(PyArray_DATA(arr_), length_, null_bitmap_data_);
+  }
+
+  RETURN_NOT_OK(ConvertData());
+  *out = std::make_shared<typename traits::ArrayType>(length_, data_, null_count,
+      null_bitmap_);
+
+  return Status::OK();
+}
+
+// True if obj represents a null: Python None or the cached np.nan
+// object.  NOTE(review): this is an identity comparison against the
+// np.nan singleton, so a distinct float('nan') instance is NOT treated
+// as null here — confirm this matches callers' expectations.
+static inline bool PyObject_is_null(const PyObject* obj) {
+  return obj == Py_None || obj == numpy_nan;
+}
+
+// True if obj is a Python string type: str/bytes on Python 3,
+// str/unicode on Python 2.
+static inline bool PyObject_is_string(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_Check(obj) || PyBytes_Check(obj);
+#else
+  return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+// True if obj is a Python bool (True or False).
+// Fixed: the previous body duplicated the string checks from
+// PyObject_is_string on both branches, which would misclassify strings
+// as booleans and never match actual bools; PyBool_Check is the correct
+// test on both Python 2 and 3 (and matches the usage in the NPY_OBJECT
+// Convert specialization below).
+static inline bool PyObject_is_bool(const PyObject* obj) {
+  return PyBool_Check(obj);
+}
+
+// Type inference for object arrays: scan forward to the first non-null
+// element and dispatch on its type; all elements are then assumed to be
+// of that one type (mixed-type arrays are rejected).
+template <>
+inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out) {
+  // Python object arrays are annoying, since we could have one of:
+  //
+  // * Strings
+  // * Booleans with nulls
+  // * Mixed type (not supported at the moment by arrow format)
+  //
+  // Additionally, nulls may be encoded either as np.nan or None. So we have to
+  // do some type inference and conversion
+
+  RETURN_NOT_OK(InitNullBitmap());
+
+  // TODO: mask not supported here
+  const PyObject** objects = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
+
+  for (int64_t i = 0; i < length_; ++i) {
+    if (PyObject_is_null(objects[i])) {
+      continue;
+    } else if (PyObject_is_string(objects[i])) {
+      return ConvertObjectStrings(out);
+    } else if (PyBool_Check(objects[i])) {
+      return ConvertBooleans(out);
+    } else {
+      return Status::TypeError("unhandled python type");
+    }
+  }
+
+  return Status::TypeError("Unable to infer type of object array, were all null");
+}
+
+// Default data conversion: zero-copy wrap of the numpy buffer in a
+// NumPyBuffer (which holds a reference to the ndarray).
+template <int TYPE>
+inline Status ArrowSerializer<TYPE>::ConvertData() {
+  // TODO(wesm): strided arrays
+  if (is_strided()) {
+    return Status::ValueError("no support for strided data yet");
+  }
+
+  data_ = std::make_shared<NumPyBuffer>(arr_);
+  return Status::OK();
+}
+
+// Booleans require a real conversion rather than a zero-copy wrap:
+// numpy stores one byte per value, arrow stores a packed bitmap.
+template <>
+inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
+  if (is_strided()) {
+    return Status::ValueError("no support for strided data yet");
+  }
+
+  int nbytes = util::bytes_for_bits(length_);
+  auto buffer = std::make_shared<arrow::PoolBuffer>(pool_);
+  RETURN_ARROW_NOT_OK(buffer->Resize(nbytes));
+
+  const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  uint8_t* bitmap = buffer->mutable_data();
+
+  memset(bitmap, 0, nbytes);
+  // int64_t index to match length_ and avoid truncation on large arrays
+  for (int64_t i = 0; i < length_; ++i) {
+    if (values[i] > 0) {
+      util::set_bit(bitmap, i);
+    }
+  }
+
+  data_ = buffer;
+
+  return Status::OK();
+}
+
+// Object arrays are handled entirely in the specialized Convert()
+// above; reaching this path would be a programming error.
+template <>
+inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
+  return Status::TypeError("NYI");
+}
+
+
+#define TO_ARROW_CASE(TYPE)                                     \
+  case NPY_##TYPE:                                              \
+    {                                                           \
+      ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask);   \
+      RETURN_NOT_OK(converter.Convert(out));                    \
+    }                                                           \
+    break;
+
+// Convert a 1-D numpy array (ao) with an optional boolean null mask
+// (mo, true == null, may be nullptr) into an arrow::Array allocated
+// from the given memory pool.
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+    std::shared_ptr<Array>* out) {
+  PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
+  PyArrayObject* mask = nullptr;
+
+  if (mo != nullptr) {
+    mask = reinterpret_cast<PyArrayObject*>(mo);
+  }
+
+  if (PyArray_NDIM(arr) != 1) {
+    return Status::ValueError("only handle 1-dimensional arrays");
+  }
+
+  switch(PyArray_DESCR(arr)->type_num) {
+    TO_ARROW_CASE(BOOL);
+    TO_ARROW_CASE(INT8);
+    TO_ARROW_CASE(INT16);
+    TO_ARROW_CASE(INT32);
+    TO_ARROW_CASE(INT64);
+    TO_ARROW_CASE(UINT8);
+    TO_ARROW_CASE(UINT16);
+    TO_ARROW_CASE(UINT32);
+    TO_ARROW_CASE(UINT64);
+    TO_ARROW_CASE(FLOAT32);
+    TO_ARROW_CASE(FLOAT64);
+    TO_ARROW_CASE(OBJECT);
+    default:
+      std::stringstream ss;
+      ss << "unsupported type " << PyArray_DESCR(arr)->type_num
+         << std::endl;
+      return Status::NotImplemented(ss.str());
+  }
+  return Status::OK();
+}
+
+// Convenience wrapper: convert without an explicit null mask.
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+    std::shared_ptr<Array>* out) {
+  return PandasMaskedToArrow(pool, ao, nullptr, out);
+}
+
+// ----------------------------------------------------------------------
+// Deserialization
+
+// Maps an arrow::Type id to the numpy type used on output, plus
+// category flags that drive overload selection among the ConvertValues
+// specializations in ArrowDeserializer below.
+template <int TYPE>
+struct arrow_traits {
+};
+
+template <>
+struct arrow_traits<arrow::Type::BOOL> {
+  static constexpr int npy_type = NPY_BOOL;
+  static constexpr bool supports_nulls = false;
+  static constexpr bool is_boolean = true;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = false;
+};
+
+// NOTE(review): na_value is declared as double even for integer types;
+// it appears unused for integers here (nulls force an upcast to
+// float64 in the deserializer) — confirm before relying on it.
+#define INT_DECL(TYPE)                                      \
+  template <>                                               \
+  struct arrow_traits<arrow::Type::TYPE> {              \
+    static constexpr int npy_type = NPY_##TYPE;             \
+    static constexpr bool supports_nulls = false;           \
+    static constexpr double na_value = NAN;                 \
+    static constexpr bool is_boolean = false;               \
+    static constexpr bool is_integer = true;                \
+    static constexpr bool is_floating = false;              \
+    typedef typename npy_traits<NPY_##TYPE>::value_type T;  \
+  };
+
+INT_DECL(INT8);
+INT_DECL(INT16);
+INT_DECL(INT32);
+INT_DECL(INT64);
+INT_DECL(UINT8);
+INT_DECL(UINT16);
+INT_DECL(UINT32);
+INT_DECL(UINT64);
+
+template <>
+struct arrow_traits<arrow::Type::FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT32;
+  static constexpr bool supports_nulls = true;
+  static constexpr float na_value = NAN;
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = true;
+  typedef typename npy_traits<NPY_FLOAT32>::value_type T;
+};
+
+template <>
+struct arrow_traits<arrow::Type::DOUBLE> {
+  static constexpr int npy_type = NPY_FLOAT64;
+  static constexpr bool supports_nulls = true;
+  static constexpr double na_value = NAN;
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = true;
+  typedef typename npy_traits<NPY_FLOAT64>::value_type T;
+};
+
+template <>
+struct arrow_traits<arrow::Type::STRING> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = false;
+};
+
+
+// Build a Python string from raw UTF8 data: unicode (str) on Python 3,
+// bytes (str) on Python 2.  Returns NULL with a Python error set on
+// failure.
+static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+#else
+  return PyString_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+#endif
+}
+
+// Converts a single-chunk arrow Column into a numpy array (PyObject*).
+// Null representation depends on the type: floats keep NaN in place,
+// integers with nulls are upcast to float64 with NaN, booleans and
+// strings with nulls become object arrays holding None.
+//
+// Fixes vs. the original draft:
+//  * AllocateOutput no longer reports success when PyArray_SimpleNew
+//    fails (callers immediately dereference PyArray_DATA(out_), so a
+//    null result must surface as an error, not a crash).
+//  * The STRING path propagates make_pystring failures instead of
+//    returning OK with a partially-filled array.
+//  * Loop indices are int64_t to match arr->length().
+//  * arrow_traits<T2> is used consistently (TYPE == T2 at every call
+//    site, but mixing the two was fragile).
+template <int TYPE>
+class ArrowDeserializer {
+ public:
+  ArrowDeserializer(const std::shared_ptr<Column>& col) :
+      col_(col) {}
+
+  Status Convert(PyObject** out) {
+    const std::shared_ptr<arrow::ChunkedArray> data = col_->data();
+    if (data->num_chunks() > 1) {
+      return Status::NotImplemented("Chunked column conversion NYI");
+    }
+
+    auto chunk = data->chunk(0);
+
+    RETURN_NOT_OK(ConvertValues<TYPE>(chunk));
+    *out = reinterpret_cast<PyObject*>(out_);
+    return Status::OK();
+  }
+
+  // Allocate the 1-D numpy output array of the given numpy type number.
+  Status AllocateOutput(int type) {
+    npy_intp dims[1] = {col_->length()};
+    out_ = reinterpret_cast<PyArrayObject*>(PyArray_SimpleNew(1, dims, type));
+
+    if (out_ == NULL) {
+      // Callers dereference PyArray_DATA(out_) right after this
+      // returns, so allocation failure must be an error, not OK
+      PyErr_Clear();
+      return Status::ArrowError("failed to allocate numpy output array");
+    }
+
+    return Status::OK();
+  }
+
+  // Floating point specialization: NaN already encodes null, so values
+  // with nulls can be written directly into the output
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_floating, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    typedef typename arrow_traits<T2>::T T;
+
+    arrow::PrimitiveArray* prim_arr = static_cast<arrow::PrimitiveArray*>(
+        arr.get());
+
+    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+
+    if (arr->null_count() > 0) {
+      T* out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+      const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = arr->IsNull(i) ? NAN : in_values[i];
+      }
+    } else {
+      memcpy(PyArray_DATA(out_), prim_arr->data()->data(),
+          arr->length() * arr->type()->value_size());
+    }
+
+    return Status::OK();
+  }
+
+  // Integer specialization: integers with nulls have no native numpy
+  // representation, so upcast to float64 and use NaN for nulls
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_integer, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    typedef typename arrow_traits<T2>::T T;
+
+    arrow::PrimitiveArray* prim_arr = static_cast<arrow::PrimitiveArray*>(
+        arr.get());
+
+    const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+    if (arr->null_count() > 0) {
+      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+
+      // Upcast to double, set NaN as appropriate
+      double* out_values = reinterpret_cast<double*>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i];
+      }
+    } else {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+
+      memcpy(PyArray_DATA(out_), in_values,
+          arr->length() * arr->type()->value_size());
+    }
+
+    return Status::OK();
+  }
+
+  // Boolean specialization: packed bitmap unpacks to uint8 when there
+  // are no nulls, otherwise to an object array of True/False/None
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_boolean, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    arrow::BooleanArray* bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
+
+    if (arr->null_count() > 0) {
+      RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+      PyObject** out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        if (bool_arr->IsNull(i)) {
+          Py_INCREF(Py_None);
+          out_values[i] = Py_None;
+        } else if (bool_arr->Value(i)) {
+          // True
+          Py_INCREF(Py_True);
+          out_values[i] = Py_True;
+        } else {
+          // False
+          Py_INCREF(Py_False);
+          out_values[i] = Py_False;
+        }
+      }
+    } else {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+
+      uint8_t* out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = static_cast<uint8_t>(bool_arr->Value(i));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // UTF8 strings always come back as an object array (str, or None for
+  // nulls)
+  template <int T2>
+  inline typename std::enable_if<
+    T2 == arrow::Type::STRING, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+    PyObject** out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+
+    arrow::StringArray* string_arr = static_cast<arrow::StringArray*>(arr.get());
+
+    const uint8_t* data;
+    int32_t length;
+    if (arr->null_count() > 0) {
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        if (string_arr->IsNull(i)) {
+          Py_INCREF(Py_None);
+          out_values[i] = Py_None;
+        } else {
+          data = string_arr->GetValue(i, &length);
+
+          out_values[i] = make_pystring(data, length);
+          if (out_values[i] == nullptr) {
+            // Propagate the failure instead of returning OK with a
+            // partially-filled array
+            PyErr_Clear();
+            return Status::ArrowError("failed creating Python string");
+          }
+        }
+      }
+    } else {
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        data = string_arr->GetValue(i, &length);
+        out_values[i] = make_pystring(data, length);
+        if (out_values[i] == nullptr) {
+          PyErr_Clear();
+          return Status::ArrowError("failed creating Python string");
+        }
+      }
+    }
+    return Status::OK();
+  }
+ private:
+  std::shared_ptr<Column> col_;
+  PyArrayObject* out_;
+};
+
+#define FROM_ARROW_CASE(TYPE)                               \
+  case arrow::Type::TYPE:                                   \
+    {                                                       \
+      ArrowDeserializer<arrow::Type::TYPE> converter(col);  \
+      return converter.Convert(out);                        \
+    }                                                       \
+    break;
+
+// Convert an arrow Column (single chunk only, for now) into a numpy
+// array suitable for assembling a pandas DataFrame.
+Status ArrowToPandas(const std::shared_ptr<Column>& col, PyObject** out) {
+  switch(col->type()->type) {
+    FROM_ARROW_CASE(BOOL);
+    FROM_ARROW_CASE(INT8);
+    FROM_ARROW_CASE(INT16);
+    FROM_ARROW_CASE(INT32);
+    FROM_ARROW_CASE(INT64);
+    FROM_ARROW_CASE(UINT8);
+    FROM_ARROW_CASE(UINT16);
+    FROM_ARROW_CASE(UINT32);
+    FROM_ARROW_CASE(UINT64);
+    FROM_ARROW_CASE(FLOAT);
+    FROM_ARROW_CASE(DOUBLE);
+    FROM_ARROW_CASE(STRING);
+    default:
+      return Status::NotImplemented("Arrow type reading not implemented");
+  }
+  return Status::OK();
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index a4f4163..58eb3ca 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -21,8 +21,29 @@
 #ifndef PYARROW_ADAPTERS_PANDAS_H
 #define PYARROW_ADAPTERS_PANDAS_H
 
+#include <Python.h>
+
+#include <memory>
+
+// Forward declarations to avoid pulling arrow headers into this header.
+// MemoryPool added: it is used by the PandasMaskedToArrow/PandasToArrow
+// declarations below but was never declared, which breaks any includer
+// that has not already declared it.
+namespace arrow {
+
+class Array;
+class Column;
+class MemoryPool;
+
+} // namespace arrow
+
 namespace pyarrow {
 
+class Status;
+
+Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject** out);
+
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+    std::shared_ptr<arrow::Array>* out);
+
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+    std::shared_ptr<arrow::Array>* out);
+
 } // namespace pyarrow
 
 #endif // PYARROW_ADAPTERS_PANDAS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index db63613..cc9ad9e 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -18,7 +18,9 @@
 #ifndef PYARROW_COMMON_H
 #define PYARROW_COMMON_H
 
-#include <Python.h>
+#include "pyarrow/config.h"
+
+#include "arrow/util/buffer.h"
 
 namespace arrow { class MemoryPool; }
 
@@ -90,6 +92,25 @@ struct PyObjectStringify {
 
 arrow::MemoryPool* GetMemoryPool();
 
+// Zero-copy arrow::Buffer wrapping a numpy array's memory.  Holds a
+// strong reference to the ndarray so the memory stays alive for the
+// lifetime of the buffer.
+class NumPyBuffer : public arrow::Buffer {
+ public:
+  NumPyBuffer(PyArrayObject* arr) :
+      Buffer(nullptr, 0) {
+    arr_ = arr;
+    Py_INCREF(arr);
+
+    data_ = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+    // Buffer sizes are in bytes.  PyArray_SIZE is the *element* count,
+    // which under-reports the buffer for any itemsize > 1 (int64,
+    // float64, ...); PyArray_NBYTES is the total byte count.
+    size_ = PyArray_NBYTES(arr_);
+  }
+
+  virtual ~NumPyBuffer() {
+    Py_XDECREF(arr_);
+  }
+
+ private:
+  PyArrayObject* arr_;
+};
+
 } // namespace pyarrow
 
 #endif // PYARROW_COMMON_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/config.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc
new file mode 100644
index 0000000..730d2db
--- /dev/null
+++ b/python/src/pyarrow/config.cc
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <Python.h>
+
+#include "pyarrow/config.h"
+
+namespace pyarrow {
+
+// Library-level initialization hook; currently a no-op.
+void pyarrow_init() {
+}
+
+// Cached reference to numpy's nan singleton, used for identity-based
+// null checks in the pandas adapters.
+PyObject* numpy_nan = nullptr;
+
+// Store a strong reference to the np.nan object (called from the
+// Python side at module import time).
+void pyarrow_set_numpy_nan(PyObject* obj) {
+  Py_INCREF(obj);
+  numpy_nan = obj;
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/config.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h
new file mode 100644
index 0000000..48ae715
--- /dev/null
+++ b/python/src/pyarrow/config.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_CONFIG_H
+#define PYARROW_CONFIG_H
+
+#include <Python.h>
+
+#include "pyarrow/numpy_interop.h"
+
+#if PY_MAJOR_VERSION >= 3
+  #define PyString_Check PyUnicode_Check
+#endif
+
+namespace pyarrow {
+
+extern PyObject* numpy_nan;
+
+void pyarrow_init();
+
+void pyarrow_set_numpy_nan(PyObject* obj);
+
+} // namespace pyarrow
+
+#endif // PYARROW_CONFIG_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/do_import_numpy.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/do_import_numpy.h b/python/src/pyarrow/do_import_numpy.h
new file mode 100644
index 0000000..bb4a382
--- /dev/null
+++ b/python/src/pyarrow/do_import_numpy.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Trick borrowed from dynd-python for initializing the NumPy array API
+
+// Trigger the array import (inversion of NO_IMPORT_ARRAY)
+#define NUMPY_IMPORT_ARRAY

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/init.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc
deleted file mode 100644
index acd851e..0000000
--- a/python/src/pyarrow/init.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "pyarrow/init.h"
-
-namespace pyarrow {
-
-void pyarrow_init() {
-}
-
-} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/init.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h
deleted file mode 100644
index 71e67a2..0000000
--- a/python/src/pyarrow/init.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PYARROW_INIT_H
-#define PYARROW_INIT_H
-
-namespace pyarrow {
-
-void pyarrow_init();
-
-} // namespace pyarrow
-
-#endif // PYARROW_INIT_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/numpy_interop.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h
new file mode 100644
index 0000000..882d287
--- /dev/null
+++ b/python/src/pyarrow/numpy_interop.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_NUMPY_INTEROP_H
+#define PYARROW_NUMPY_INTEROP_H
+
+#include <Python.h>
+
+#include <numpy/numpyconfig.h>
+
+// Don't use the deprecated Numpy functions
+#ifdef NPY_1_7_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#else
+#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED
+#define NPY_ARRAY_ALIGNED NPY_ALIGNED
+#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE
+#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY
+#endif
+
+// This is required to be able to access the NumPy C API properly in C++ files
+// other than this main one
+#define PY_ARRAY_UNIQUE_SYMBOL pyarrow_ARRAY_API
+#ifndef NUMPY_IMPORT_ARRAY
+#define NO_IMPORT_ARRAY
+#endif
+
+#include <numpy/arrayobject.h>
+#include <numpy/ufuncobject.h>
+
+namespace pyarrow {
+
+// Initialize the NumPy array and ufunc C APIs.  Only the single
+// translation unit that defines NUMPY_IMPORT_ARRAY (via
+// do_import_numpy.h) actually performs the import; everywhere else
+// this is a no-op.  Returns 0 on success, -1 on failure (the
+// import_*1 macros return -1 with a Python exception set).
+inline int import_numpy() {
+#ifdef NUMPY_IMPORT_ARRAY
+  import_array1(-1);
+  import_umath1(-1);
+#endif
+
+  return 0;
+}
+
+} // namespace pyarrow
+
+#endif // PYARROW_NUMPY_INTEROP_H


Mime
View raw message