arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [1/2] arrow git commit: ARROW-31: Python: prototype user object model, add PyList conversion path with type inference
Date Mon, 07 Mar 2016 23:02:26 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 571343bbe -> 9afb66778


http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/arrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/arrow/tests/test_array.py b/python/arrow/tests/test_array.py
new file mode 100644
index 0000000..8eaa533
--- /dev/null
+++ b/python/arrow/tests/test_array.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from arrow.compat import unittest
+import arrow
+
+
+class TestArrayAPI(unittest.TestCase):  # element access on converted arrays
+
+    def test_getitem_NA(self):
+        arr = arrow.from_pylist([1, None, 2])
+        assert arr[1] is arrow.NA  # a None input reads back as arrow.NA itself

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/arrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py
new file mode 100644
index 0000000..57e6ab9
--- /dev/null
+++ b/python/arrow/tests/test_convert_builtin.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from arrow.compat import unittest
+import arrow
+
+
+class TestConvertList(unittest.TestCase):  # type inference in from_pylist
+
+    def test_boolean(self):
+        pass  # placeholder: nothing is asserted about booleans yet
+
+    def test_empty_list(self):
+        arr = arrow.from_pylist([])
+        assert len(arr) == 0
+        assert arr.null_count == 0
+        assert arr.type == arrow.null()  # no values to infer a type from
+
+    def test_all_none(self):
+        arr = arrow.from_pylist([None, None])
+        assert len(arr) == 2
+        assert arr.null_count == 2
+        assert arr.type == arrow.null()  # only None seen, so the null type
+
+    def test_integer(self):
+        arr = arrow.from_pylist([1, None, 3, None])
+        assert len(arr) == 4
+        assert arr.null_count == 2  # each None counts as one null
+        assert arr.type == arrow.int64()
+
+    def test_garbage_collection(self):
+        import gc
+        bytes_before = arrow.total_allocated_bytes()
+        arrow.from_pylist([1, None, 3, None])  # result is dropped immediately
+        gc.collect()
+        assert arrow.total_allocated_bytes() == bytes_before  # nothing retained
+
+    def test_double(self):
+        data = [1.5, 1, None, 2.5, None, None]  # an int mixed in with floats
+        arr = arrow.from_pylist(data)
+        assert len(arr) == 6
+        assert arr.null_count == 3
+        assert arr.type == arrow.double()
+
+    def test_string(self):
+        data = ['foo', b'bar', None, 'arrow']  # text and bytes literals mixed
+        arr = arrow.from_pylist(data)
+        assert len(arr) == 4
+        assert arr.null_count == 1
+        assert arr.type == arrow.string()
+
+    def test_mixed_nesting_levels(self):
+        arrow.from_pylist([1, 2, None])  # these three inputs must be accepted
+        arrow.from_pylist([[1], [2], None])
+        arrow.from_pylist([[1], [2], [None]])
+
+        with self.assertRaises(arrow.ArrowException):
+            arrow.from_pylist([1, 2, [1]])  # scalars at two different depths
+
+        with self.assertRaises(arrow.ArrowException):
+            arrow.from_pylist([1, 2, []])  # a list next to top-level scalars
+
+        with self.assertRaises(arrow.ArrowException):
+            arrow.from_pylist([[1], [2], [None, [1]]])
+
+    def test_list_of_int(self):
+        data = [[1, 2, 3], [], None, [1, 2]]  # includes an empty and a null list
+        arr = arrow.from_pylist(data)
+        assert len(arr) == 4
+        assert arr.null_count == 1
+        assert arr.type == arrow.list_(arrow.int64())

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/arrow/tests/test_schema.py
----------------------------------------------------------------------
diff --git a/python/arrow/tests/test_schema.py b/python/arrow/tests/test_schema.py
new file mode 100644
index 0000000..a89edd7
--- /dev/null
+++ b/python/arrow/tests/test_schema.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from arrow.compat import unittest
+import arrow
+
+
+class TestTypes(unittest.TestCase):  # string forms of type and field objects
+
+    def test_integers(self):
+        dtypes = ['int8', 'int16', 'int32', 'int64',
+                  'uint8', 'uint16', 'uint32', 'uint64']
+
+        for name in dtypes:
+            factory = getattr(arrow, name)  # e.g. arrow.int8
+            t = factory()
+            t_required = factory(False)  # presumably nullable=False; confirm
+
+            assert str(t) == name
+            assert str(t_required) == '{0} not null'.format(name)
+
+    def test_list(self):
+        value_type = arrow.int32()
+        list_type = arrow.list_(value_type)
+        assert str(list_type) == 'list<int32>'  # value type shown in brackets
+
+    def test_string(self):
+        t = arrow.string()
+        assert str(t) == 'string'
+
+    def test_field(self):
+        t = arrow.string()
+        f = arrow.field('foo', t)
+
+        assert f.name == 'foo'
+        assert f.type is t  # the field hands back the very same type object
+        assert repr(f) == "Field('foo', type=string)"

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/setup.py
----------------------------------------------------------------------
diff --git a/python/setup.py b/python/setup.py
index f6b0a4b..9a0de07 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -124,7 +124,10 @@ class build_ext(_build_ext):
                              static_lib_option, source]
 
             self.spawn(cmake_command)
-            self.spawn(['make'])
+            args = ['make']
+            if 'PYARROW_PARALLEL' in os.environ:
+                args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL']))
+            self.spawn(args)
         else:
             import shlex
             cmake_generator = 'Visual Studio 14 2015'
@@ -207,7 +210,7 @@ class build_ext(_build_ext):
             return name + suffix
 
     def get_cmake_cython_names(self):
-        return ['config', 'parquet']
+        return ['array', 'config', 'error', 'parquet', 'scalar', 'schema']
 
     def get_names(self):
         return self._found_names

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
new file mode 100644
index 0000000..ae84fa1
--- /dev/null
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -0,0 +1,415 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <Python.h>
+#include <sstream>
+
+#include "pyarrow/adapters/builtin.h"
+
+#include <arrow/api.h>
+
+#include "pyarrow/status.h"
+
+using arrow::ArrayBuilder;
+using arrow::DataType;
+using arrow::LogicalType;
+
+namespace pyarrow {
+
+static inline bool IsPyInteger(PyObject* obj) {  // is obj a Python integer?
+#if PYARROW_IS_PY2
+  return PyLong_Check(obj) || PyInt_Check(obj);  // Python 2 has long and int
+#else
+  return PyLong_Check(obj);
+#endif
+}
+
+static inline bool IsPyBaseString(PyObject* obj) {  // is obj a string object?
+#if PYARROW_IS_PY2
+  return PyString_Check(obj) || PyUnicode_Check(obj);  // str or unicode
+#else
+  return PyUnicode_Check(obj);  // only unicode is tested on this branch
+#endif
+}
+
+class ScalarVisitor {  // tallies the kinds of scalar objects it is shown
+ public:
+  ScalarVisitor() :
+      total_count_(0),
+      none_count_(0),
+      bool_count_(0),
+      int_count_(0),
+      float_count_(0),
+      string_count_(0) {}
+
+  void Visit(PyObject* obj) {  // classify obj and bump the matching counter
+    ++total_count_;
+    if (obj == Py_None) {
+      ++none_count_;
+    } else if (PyFloat_Check(obj)) {
+      ++float_count_;
+    } else if (IsPyInteger(obj)) {
+      ++int_count_;
+    } else if (IsPyBaseString(obj)) {
+      ++string_count_;
+    } else {
+      // TODO(wesm): accumulate error information somewhere
+    }  // an unrecognized object only contributes to total_count_
+  }
+
+  std::shared_ptr<DataType> GetType() {  // pick one type from the tallies
+    // TODO(wesm): handling mixed-type cases
+    if (float_count_) {  // any float wins, even when ints were also seen
+      return arrow::DOUBLE;
+    } else if (int_count_) {
+      // TODO(wesm): tighter type later
+      return arrow::INT64;
+    } else if (bool_count_) {  // NOTE(review): Visit() never bumps bool_count_
+      return arrow::BOOL;
+    } else if (string_count_) {
+      return arrow::STRING;
+    } else {
+      return arrow::NA;  // nothing typed was seen
+    }
+  }
+
+  int64_t total_count() const {  // number of Visit() calls so far
+    return total_count_;
+  }
+
+ private:
+  int64_t total_count_;
+  int64_t none_count_;
+  int64_t bool_count_;
+  int64_t int_count_;
+  int64_t float_count_;
+  int64_t string_count_;
+
+  // Place to accumulate errors
+  // std::vector<Status> errors_;
+};
+
+static constexpr int MAX_NESTING_LEVELS = 32;
+// Walks a nested Python sequence, recording nesting depths and scalar kinds
+class SeqVisitor {
+ public:
+  SeqVisitor() :
+      max_nesting_level_(0),
+      nesting_histogram_() {  // value-initialization zeroes every bucket
+  }
+
+  Status Visit(PyObject* obj, int level=0) {
+    // The histogram is a fixed-size array, so refuse deeper nesting
+    if (level >= MAX_NESTING_LEVELS) {
+      return Status::ValueError("Maximum nesting level exceeded");
+    }
+    Py_ssize_t size = PySequence_Size(obj);
+    if (level > max_nesting_level_) { max_nesting_level_ = level; }
+
+    for (int64_t i = 0; i < size; ++i) {
+      // TODO(wesm): Specialize for PyList_GET_ITEM?
+      OwnedRef item_ref(PySequence_GetItem(obj, i));
+      RETURN_IF_PYERROR();  // GetItem returns NULL with an exception set
+      PyObject* item = item_ref.obj();
+      if (PyList_Check(item)) {
+        PY_RETURN_NOT_OK(Visit(item, level + 1));
+      } else if (PyDict_Check(item)) {
+        return Status::NotImplemented("No type inference for dicts");
+      } else {
+        // We permit nulls at any level of nesting
+        if (item == Py_None) {
+          // TODO
+        } else {
+          ++nesting_histogram_[level];  // a non-null scalar lives at this depth
+          scalars_.Visit(item);
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  std::shared_ptr<DataType> GetType() {  // may return nullptr, see below
+    if (scalars_.total_count() == 0) {
+      if (max_nesting_level_ == 0) {
+        return arrow::NA;
+      } else {
+        return nullptr;  // nested lists but no scalar to type them with
+      }
+    } else {
+      std::shared_ptr<DataType> result = scalars_.GetType();
+      for (int i = 0; i < max_nesting_level_; ++i) {
+        result = std::make_shared<arrow::ListType>(result);  // one wrap per level
+      }
+      return result;
+    }
+  }
+
+  Status Validate() const {  // reject inputs whose scalars sit at mixed depths
+    if (scalars_.total_count() > 0) {
+      if (num_nesting_levels() > 1) {
+        return Status::ValueError("Mixed nesting levels not supported");
+      } else if (max_observed_level() < max_nesting_level_) {
+        return Status::ValueError("Mixed nesting levels not supported");
+      }
+    }
+    return Status::OK();
+  }
+
+  int max_observed_level() const {  // deepest level holding a non-null scalar
+    int result = 0;
+    for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
+      if (nesting_histogram_[i] > 0) {
+        result = i;
+      }
+    }
+    return result;
+  }
+
+  int num_nesting_levels() const {  // how many levels hold non-null scalars
+    int result = 0;
+    for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
+      if (nesting_histogram_[i] > 0) {
+        ++result;
+      }
+    }
+    return result;
+  }
+
+ private:
+  ScalarVisitor scalars_;
+
+  // Track observed
+  int max_nesting_level_;  // deepest list level visited, scalars or not
+  int nesting_histogram_[MAX_NESTING_LEVELS];  // non-null scalars per level
+};
+
+// Non-exhaustive type inference. Sets *size to the length of obj and *out_type
+// to the inferred type; fails with TypeError when obj is not a sequence
+static Status InferArrowType(PyObject* obj, int64_t* size,
+    std::shared_ptr<DataType>* out_type) {
+  *size = PySequence_Size(obj);
+  if (PyErr_Occurred()) {
+    // Not a sequence
+    PyErr_Clear();
+    return Status::TypeError("Object is not a sequence");
+  }
+  // For 0-length sequences, refuse to guess
+  if (*size == 0) {
+    *out_type = arrow::NA;
+    return Status::OK();  // nothing to visit, so stop here
+  }
+  SeqVisitor seq_visitor;
+  PY_RETURN_NOT_OK(seq_visitor.Visit(obj));
+  PY_RETURN_NOT_OK(seq_visitor.Validate());
+
+  *out_type = seq_visitor.GetType();
+  return Status::OK();
+}
+
+// Marshal Python sequence (list, tuple, etc.) to Arrow array
+class SeqConverter {
+ public:
+  virtual ~SeqConverter() {}  // polymorphic base: safe to delete via base pointer
+  virtual Status Init(const std::shared_ptr<ArrayBuilder>& builder) {
+    builder_ = builder;  // the builder that AppendData() writes into
+    return Status::OK();
+  }
+  virtual Status AppendData(PyObject* seq) = 0;  // append each element of seq
+
+ protected:
+  std::shared_ptr<ArrayBuilder> builder_;
+};
+
+template <typename BuilderType>
+class TypedConverter : public SeqConverter {  // adds a BuilderType* view
+ public:
+  Status Init(const std::shared_ptr<ArrayBuilder>& builder) override {
+    builder_ = builder;  // shared ownership keeps typed_builder_ valid
+    typed_builder_ = static_cast<BuilderType*>(builder.get());  // unchecked cast
+    return Status::OK();
+  }
+
+ protected:
+  BuilderType* typed_builder_;  // non-owning; unset until Init() is called
+};
+
+class BoolConverter : public TypedConverter<arrow::BooleanBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {  // stub: fail loudly, append nothing
+    return Status::NotImplemented("Boolean conversion not implemented");
+  }
+};
+
+class Int64Converter : public TypedConverter<arrow::Int64Builder> {
+ public:
+  Status AppendData(PyObject* seq) override {  // None -> null, else int64 value
+    int64_t val;
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));  // released at end of iteration
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        val = PyLong_AsLongLong(item.obj());
+        RETURN_IF_PYERROR();  // conversion failure becomes an error Status
+        RETURN_ARROW_NOT_OK(typed_builder_->Append(val));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {  // None -> null, else double value
+    double val;  // must be floating point: an integer here would turn 1.5 into 1
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));  // released at end of iteration
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        val = PyFloat_AsDouble(item.obj());
+        RETURN_IF_PYERROR();  // conversion failure becomes an error Status
+        RETURN_ARROW_NOT_OK(typed_builder_->Append(val));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+class StringConverter : public TypedConverter<arrow::StringBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {  // None -> null, else raw bytes
+    PyObject* item;
+    PyObject* bytes_obj;
+    OwnedRef tmp;  // holds the UTF-8 copy of the most recent unicode item
+    const char* bytes;
+    int32_t length;  // NOTE(review): narrower than Py_ssize_t; confirm sizes fit
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      item = PySequence_GetItem(seq, i);
+      OwnedRef holder(item);  // releases item at the end of this iteration
+
+      if (item == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+        continue;
+      } else if (PyUnicode_Check(item)) {
+        tmp.reset(PyUnicode_AsUTF8String(item));  // unicode is encoded to UTF-8
+        RETURN_IF_PYERROR();
+        bytes_obj = tmp.obj();
+      } else if (PyBytes_Check(item)) {
+        bytes_obj = item;  // bytes are appended as-is
+      } else {
+        return Status::TypeError("Non-string value encountered");
+      }
+      // No error checking
+      length = PyBytes_GET_SIZE(bytes_obj);
+      bytes = PyBytes_AS_STRING(bytes_obj);
+      RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length));
+    }
+    return Status::OK();
+  }
+};
+
+class ListConverter : public TypedConverter<arrow::ListBuilder> {
+ public:
+  Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
+  // Append each element of seq as one list slot; None becomes a null slot
+  Status AppendData(PyObject* seq) override {
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));  // released at end of iteration
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        RETURN_ARROW_NOT_OK(typed_builder_->Append());  // open slot; keep status
+        PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj()));
+      }
+    }
+    return Status::OK();
+  }
+ protected:
+  std::shared_ptr<SeqConverter> value_converter_;  // converts the list elements
+};
+
+// Dynamic constructor for sequence converters
+// Returns nullptr when no converter exists for the given type
+std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) {
+  switch (type->type) {
+    case LogicalType::BOOL:
+      return std::make_shared<BoolConverter>();
+    case LogicalType::INT64:
+      return std::make_shared<Int64Converter>();
+    case LogicalType::DOUBLE:
+      return std::make_shared<DoubleConverter>();
+    case LogicalType::STRING:
+      return std::make_shared<StringConverter>();
+    case LogicalType::LIST:
+      return std::make_shared<ListConverter>();
+    case LogicalType::STRUCT:
+    default:
+      return nullptr;
+  }
+}
+
+Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
+  builder_ = builder;
+  typed_builder_ = static_cast<arrow::ListBuilder*>(builder.get());
+
+  value_converter_ = GetConverter(static_cast<arrow::ListType*>(
+          builder->type().get())->value_type);
+  if (value_converter_ == nullptr) {
+    return Status::NotImplemented("value type not implemented");
+  }
+
+  // Propagate any failure from the nested converter instead of dropping it
+  return value_converter_->Init(typed_builder_->value_builder());
+}
+
+// Infer an Arrow type for the Python sequence obj and build the array into *out
+Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
+  std::shared_ptr<DataType> type;
+  int64_t size;
+  PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type));
+  if (type == nullptr) {  // e.g. [[]]: nesting was seen but no scalar to type it
+    return Status::NotImplemented("Unable to infer element type");
+  }
+
+  // Handle NA / NullType case
+  if (type->type == LogicalType::NA) {
+    out->reset(new arrow::Array(type, size, size));
+    return Status::OK();
+  }
+  std::shared_ptr<SeqConverter> converter = GetConverter(type);
+  if (converter == nullptr) {
+    std::stringstream ss;
+    ss << "No type converter implemented for "
+       << type->ToString();
+    return Status::NotImplemented(ss.str());
+  }
+
+  // Give the sequence converter an array builder
+  std::shared_ptr<ArrayBuilder> builder;
+  RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder));
+  PY_RETURN_NOT_OK(converter->Init(builder));  // e.g. unsupported list value type
+  PY_RETURN_NOT_OK(converter->AppendData(obj));
+  *out = builder->Finish();
+  return Status::OK();
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/adapters/builtin.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h
new file mode 100644
index 0000000..24886f4
--- /dev/null
+++ b/python/src/pyarrow/adapters/builtin.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+
+#ifndef PYARROW_ADAPTERS_BUILTIN_H
+#define PYARROW_ADAPTERS_BUILTIN_H
+
+#include <Python.h>
+
+#include <memory>
+
+#include "pyarrow/common.h"
+
+namespace arrow { class Array; }
+
+namespace pyarrow {
+
+class Status;
+
+Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out);
+
+} // namespace pyarrow
+
+#endif // PYARROW_ADAPTERS_BUILTIN_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
new file mode 100644
index 0000000..a4f4163
--- /dev/null
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#ifndef PYARROW_ADAPTERS_PANDAS_H
+#define PYARROW_ADAPTERS_PANDAS_H
+
+namespace pyarrow {
+
+} // namespace pyarrow
+
+#endif // PYARROW_ADAPTERS_PANDAS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/api.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h
index c2285de..72be6af 100644
--- a/python/src/pyarrow/api.h
+++ b/python/src/pyarrow/api.h
@@ -18,4 +18,11 @@
 #ifndef PYARROW_API_H
 #define PYARROW_API_H
 
+#include "pyarrow/status.h"
+
+#include "pyarrow/helpers.h"
+
+#include "pyarrow/adapters/builtin.h"
+#include "pyarrow/adapters/pandas.h"
+
 #endif // PYARROW_API_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
new file mode 100644
index 0000000..a2748f9
--- /dev/null
+++ b/python/src/pyarrow/common.cc
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "pyarrow/common.h"
+
+#include <cstdlib>
+#include <mutex>
+#include <sstream>
+
+#include <arrow/util/memory-pool.h>
+#include <arrow/util/status.h>
+
+#include "pyarrow/status.h"
+
+namespace pyarrow {
+
+class PyArrowMemoryPool : public arrow::MemoryPool {  // malloc plus a byte count
+ public:
+  PyArrowMemoryPool() : bytes_allocated_(0) {}
+  virtual ~PyArrowMemoryPool() {}
+
+  arrow::Status Allocate(int64_t size, uint8_t** out) override {
+    std::lock_guard<std::mutex> guard(pool_lock_);  // held for the whole call
+    *out = static_cast<uint8_t*>(std::malloc(size));
+    if (*out == nullptr) {  // NOTE(review): malloc(0) may also yield null; confirm
+      std::stringstream ss;
+      ss << "malloc of size " << size << " failed";
+      return arrow::Status::OutOfMemory(ss.str());
+    }
+
+    bytes_allocated_ += size;  // counted only after a successful malloc
+
+    return arrow::Status::OK();
+  }
+
+  int64_t bytes_allocated() const override {  // current running total
+    std::lock_guard<std::mutex> guard(pool_lock_);
+    return bytes_allocated_;
+  }
+
+  void Free(uint8_t* buffer, int64_t size) override {
+    std::lock_guard<std::mutex> guard(pool_lock_);
+    std::free(buffer);
+    bytes_allocated_ -= size;  // trusts the caller to pass the allocated size
+  }
+
+ private:
+  mutable std::mutex pool_lock_;  // mutable so the const getter can lock it
+  int64_t bytes_allocated_;
+};
+
+arrow::MemoryPool* GetMemoryPool() {  // every call returns the same pool
+  static PyArrowMemoryPool memory_pool;  // constructed on the first call
+  return &memory_pool;
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
new file mode 100644
index 0000000..a43e4d2
--- /dev/null
+++ b/python/src/pyarrow/common.h
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_COMMON_H
+#define PYARROW_COMMON_H
+
+#include <Python.h>
+
+namespace arrow { class MemoryPool; }
+
+namespace pyarrow {
+
+#define PYARROW_IS_PY2 (PY_MAJOR_VERSION <= 2)  // true only when built for Python 2
+
+#define RETURN_ARROW_NOT_OK(s) do {             \
+    arrow::Status _s = (s);                     \
+    if (!_s.ok()) {                             \
+      return Status::ArrowError(_s.ToString()); \
+    }                                           \
+  } while (0);  // reports the saved _s, so the argument is evaluated only once
+
+// Owns one reference to a PyObject and releases it on destruction or reset()
+class OwnedRef {
+ public:
+  OwnedRef() : obj_(nullptr) {}
+  OwnedRef(PyObject* obj) : obj_(obj) {}  // takes over the caller's reference
+
+  // Copying would release the same reference twice, so it is disallowed
+  OwnedRef(const OwnedRef&) = delete;
+  OwnedRef& operator=(const OwnedRef&) = delete;
+
+  ~OwnedRef() {
+    Py_XDECREF(obj_);
+  }
+
+  // Release the held reference (if any) and take ownership of obj
+  void reset(PyObject* obj) {
+    Py_XDECREF(obj_);
+    obj_ = obj;
+  }
+
+  PyObject* obj() const { return obj_; }  // borrowed pointer; may be nullptr
+
+ private:
+  PyObject* obj_;
+};
+
+// UTF-8 bytes view of a str/bytes object; bytes stays nullptr for anything else
+struct PyObjectStringify {
+  OwnedRef tmp_obj;  // keeps the encoded copy alive while bytes is in use
+  const char* bytes;
+  PyObjectStringify(PyObject* obj) : bytes(nullptr) {
+    PyObject* bytes_obj = obj;
+    if (obj != nullptr && PyUnicode_Check(obj)) {
+      bytes_obj = PyUnicode_AsUTF8String(obj);  // new reference, NULL on failure
+      tmp_obj.reset(bytes_obj);
+    }
+    if (bytes_obj != nullptr && PyBytes_Check(bytes_obj)) {
+      bytes = PyBytes_AS_STRING(bytes_obj);
+    }
+  }
+};
+
+// TODO(wesm): We can just let errors pass through. To be explored later
+#define RETURN_IF_PYERROR()                                                 \
+  if (PyErr_Occurred()) { /* turn the pending exception into a Status */    \
+    PyObject *exc_type, *exc_value, *traceback;                             \
+    PyErr_Fetch(&exc_type, &exc_value, &traceback);                         \
+    PyObjectStringify stringified(exc_value);                               \
+    std::string message(stringified.bytes ? stringified.bytes : "");        \
+    Py_XDECREF(exc_type);                                                   \
+    Py_XDECREF(exc_value); /* value and traceback may be NULL */            \
+    Py_XDECREF(traceback);                                                  \
+    return Status::UnknownError(message);                                   \
+  }
+
+arrow::MemoryPool* GetMemoryPool();
+
+} // namespace pyarrow
+
+#endif // PYARROW_COMMON_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
new file mode 100644
index 0000000..d0969da
--- /dev/null
+++ b/python/src/pyarrow/helpers.cc
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "pyarrow/helpers.h"
+
+#include <arrow/api.h>
+
+using namespace arrow;
+
+namespace pyarrow {
+
+#define GET_PRIMITIVE_TYPE(NAME, Type)          \
+  case LogicalType::NAME:                       \
+    if (nullable) {                             \
+      return NAME;                              \
+    } else {                                    \
+      return std::make_shared<Type>(nullable);  \
+    }                                           \
+    break;  // never reached: both branches above return
+
+std::shared_ptr<DataType> GetPrimitiveType(LogicalType::type type,
+    bool nullable) {  // returns nullptr for types not listed below
+  switch (type) {
+    case LogicalType::NA:
+      return NA;  // the nullable flag is ignored for NA
+    GET_PRIMITIVE_TYPE(UINT8, UInt8Type);
+    GET_PRIMITIVE_TYPE(INT8, Int8Type);
+    GET_PRIMITIVE_TYPE(UINT16, UInt16Type);
+    GET_PRIMITIVE_TYPE(INT16, Int16Type);
+    GET_PRIMITIVE_TYPE(UINT32, UInt32Type);
+    GET_PRIMITIVE_TYPE(INT32, Int32Type);
+    GET_PRIMITIVE_TYPE(UINT64, UInt64Type);
+    GET_PRIMITIVE_TYPE(INT64, Int64Type);
+    GET_PRIMITIVE_TYPE(BOOL, BooleanType);
+    GET_PRIMITIVE_TYPE(FLOAT, FloatType);
+    GET_PRIMITIVE_TYPE(DOUBLE, DoubleType);
+    GET_PRIMITIVE_TYPE(STRING, StringType);
+    default:
+      return nullptr;
+  }
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
new file mode 100644
index 0000000..1a24f05
--- /dev/null
+++ b/python/src/pyarrow/helpers.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_HELPERS_H
+#define PYARROW_HELPERS_H
+
+#include <arrow/api.h>
+#include <memory>
+
+namespace pyarrow {
+
+using arrow::DataType;
+using arrow::LogicalType;
+
+// Returns a DataType for the given logical type tag, or a null pointer when
+// the tag is not one the definition handles. `nullable` selects between the
+// two ways the definition produces an instance for a handled tag.
+std::shared_ptr<DataType> GetPrimitiveType(LogicalType::type type,
+    bool nullable);
+
+} // namespace pyarrow
+
+#endif // PYARROW_HELPERS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/init.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc
index c36f413..acd851e 100644
--- a/python/src/pyarrow/init.cc
+++ b/python/src/pyarrow/init.cc
@@ -17,13 +17,9 @@
 
 #include "pyarrow/init.h"
 
-namespace arrow {
-
-namespace py {
+namespace pyarrow {
 
 void pyarrow_init() {
 }
 
-} // namespace py
-
-} // namespace arrow
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/init.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h
index 1fc9f10..71e67a2 100644
--- a/python/src/pyarrow/init.h
+++ b/python/src/pyarrow/init.h
@@ -18,14 +18,10 @@
 #ifndef PYARROW_INIT_H
 #define PYARROW_INIT_H
 
-namespace arrow {
-
-namespace py {
+namespace pyarrow {
 
 void pyarrow_init();
 
-} // namespace py
-
-} // namespace arrow
+} // namespace pyarrow
 
 #endif // PYARROW_INIT_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/status.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc
new file mode 100644
index 0000000..1cd54f6
--- /dev/null
+++ b/python/src/pyarrow/status.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#include "pyarrow/status.h"
+
+#include <assert.h>
+#include <cstdint>
+#include <cstring>
+
+namespace pyarrow {
+
+Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) {
+  assert(code != StatusCode::OK);
+  const uint32_t size = msg.size();
+  char* result = new char[size + 7];
+  memcpy(result, &size, sizeof(size));
+  result[4] = static_cast<char>(code);
+  memcpy(result + 5, &posix_code, sizeof(posix_code));
+  memcpy(result + 7, msg.c_str(), msg.size());
+  state_ = result;
+}
+
+// Returns a new[]-allocated duplicate of an encoded status buffer. The first
+// four bytes of `state` hold the message length; the copy spans that many
+// bytes plus the 7-byte header. Nothing in this function frees the result.
+const char* Status::CopyState(const char* state) {
+  uint32_t message_size;
+  memcpy(&message_size, state, sizeof(message_size));
+  const uint32_t total_size = message_size + 7;
+  char* duplicate = new char[total_size];
+  memcpy(duplicate, state, total_size);
+  return duplicate;
+}
+
+// Returns a short human-readable name for this status's code, without the
+// message text. A status whose state_ is NULL reports "OK".
+std::string Status::CodeAsString() const {
+  if (state_ == NULL) {
+    return "OK";
+  }
+
+  // Start from the generic label so a code byte that matches no enumerator
+  // cannot leave the pointer uninitialized. There is deliberately no
+  // `default:` label, which keeps compiler warnings about unhandled
+  // enumerators available.
+  const char* type = "Unknown error";
+  switch (code()) {
+    case StatusCode::OK:
+      type = "OK";
+      break;
+    case StatusCode::OutOfMemory:
+      type = "Out of memory";
+      break;
+    case StatusCode::KeyError:
+      type = "Key error";
+      break;
+    case StatusCode::TypeError:
+      // Must differ from the ValueError label so the two are distinguishable.
+      type = "Type error";
+      break;
+    case StatusCode::ValueError:
+      type = "Value error";
+      break;
+    case StatusCode::IOError:
+      type = "IO error";
+      break;
+    case StatusCode::NotImplemented:
+      type = "Not implemented";
+      break;
+    case StatusCode::ArrowError:
+      type = "Arrow C++ error";
+      break;
+    case StatusCode::UnknownError:
+      type = "Unknown error";
+      break;
+  }
+  return std::string(type);
+}
+
+// Formats the status for display: the code name alone when state_ is NULL,
+// otherwise "<code name>: <message>", where the message is the
+// length-prefixed byte run stored after the 7-byte header.
+std::string Status::ToString() const {
+  std::string text(CodeAsString());
+  if (state_ != NULL) {
+    uint32_t message_size;
+    memcpy(&message_size, state_, sizeof(message_size));
+    text.append(": ");
+    text.append(state_ + 7, message_size);
+  }
+  return text;
+}
+
+} // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9afb6677/python/src/pyarrow/status.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/status.h b/python/src/pyarrow/status.h
new file mode 100644
index 0000000..cb8c8ad
--- /dev/null
+++ b/python/src/pyarrow/status.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef PYARROW_STATUS_H_
+#define PYARROW_STATUS_H_
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+namespace pyarrow {
+
+// Evaluates `s` exactly once and, if the resulting Status is not ok(), returns
+// it from the enclosing function. The expansion intentionally has no trailing
+// semicolon: the one written at the call site completes the statement, so the
+// macro can be used as the body of an unbraced if/else.
+#define PY_RETURN_NOT_OK(s) do {                \
+    Status _s = (s);                            \
+    if (!_s.ok()) return _s;                    \
+  } while (0)
+
+// Category of a Status. Status keeps the value in a single byte of its state
+// buffer (state_[4]) and casts it back to StatusCode when read.
+enum class StatusCode: char {
+  // Success. Status represents this with a NULL state_ rather than a buffer.
+  OK = 0,
+  OutOfMemory = 1,
+  KeyError = 2,
+  TypeError = 3,
+  ValueError = 4,
+  IOError = 5,
+  NotImplemented = 6,
+
+  // Rendered as "Arrow C++ error" by Status::CodeAsString.
+  ArrowError = 7,
+
+  UnknownError = 10
+};
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : state_(NULL) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) {
+    return Status(StatusCode::OutOfMemory, msg, posix_code);
+  }
+
+  static Status KeyError(const std::string& msg) {
+    return Status(StatusCode::KeyError, msg, -1);
+  }
+
+  static Status TypeError(const std::string& msg) {
+    return Status(StatusCode::TypeError, msg, -1);
+  }
+
+  static Status IOError(const std::string& msg) {
+    return Status(StatusCode::IOError, msg, -1);
+  }
+
+  static Status ValueError(const std::string& msg) {
+    return Status(StatusCode::ValueError, msg, -1);
+  }
+
+  static Status NotImplemented(const std::string& msg) {
+    return Status(StatusCode::NotImplemented, msg, -1);
+  }
+
+  static Status UnknownError(const std::string& msg) {
+    return Status(StatusCode::UnknownError, msg, -1);
+  }
+
+  static Status ArrowError(const std::string& msg) {
+    return Status(StatusCode::ArrowError, msg, -1);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return (state_ == NULL); }
+
+  bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
+  bool IsKeyError() const { return code() == StatusCode::KeyError; }
+  bool IsIOError() const { return code() == StatusCode::IOError; }
+  bool IsTypeError() const { return code() == StatusCode::TypeError; }
+  bool IsValueError() const { return code() == StatusCode::ValueError; }
+
+  bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
+
+  bool IsArrowError() const { return code() == StatusCode::ArrowError; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+  // Return a string representation of the status code, without the message
+  // text or posix code information.
+  std::string CodeAsString() const;
+
+  // Get the POSIX code associated with this Status, or -1 if there is none.
+  int16_t posix_code() const;
+
+ private:
+  // OK status has a NULL state_.  Otherwise, state_ is a new[] array
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4]    == code
+  //    state_[5..6] == posix_code
+  //    state_[7..]  == message
+  const char* state_;
+
+  StatusCode code() const {
+    return ((state_ == NULL) ?
+        StatusCode::OK : static_cast<StatusCode>(state_[4]));
+  }
+
+  Status(StatusCode code, const std::string& msg, int16_t posix_code);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+  state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
+}
+
+// Copy assignment: replaces this status's buffer with a duplicate of s's
+// buffer (or NULL when s is a success status).
+inline void Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  if (state_ != s.state_) {
+    // Duplicate before releasing the old buffer: if the allocation inside
+    // CopyState throws, state_ must still point at live memory so that the
+    // destructor does not free it a second time.
+    const char* copy = (s.state_ == NULL) ? NULL : CopyState(s.state_);
+    delete[] state_;
+    state_ = copy;
+  }
+}
+
+}  // namespace pyarrow
+
+#endif // PYARROW_STATUS_H_


Mime
View raw message