arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-242: Support Timestamp Data Type
Date Sun, 28 Aug 2016 19:25:46 GMT
Repository: arrow
Updated Branches:
  refs/heads/master e081a4c27 -> 0a411fd29


ARROW-242: Support Timestamp Data Type

For the Pandas<->Parquet bridge this is a lossy conversion but must be explicitly activated
by the user.

Regarding Parquet 1.0: Yes, the logical type is not supported but should be simply ignored
by the reader. Implementation for INT96 timestamps is not in the scope of this PR.

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #107 from xhochy/arrow-242 and squashes the following commits:

8db6968 [Uwe L. Korn] Add missing include
34126b1 [Uwe L. Korn] ARROW-242: Support Timestamp Data Type


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/0a411fd2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/0a411fd2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/0a411fd2

Branch: refs/heads/master
Commit: 0a411fd29ed1baac6f1524be82fc15e08f2b28db
Parents: e081a4c
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sun Aug 28 15:25:35 2016 -0400
Committer: Wes McKinney <wesm@apache.org>
Committed: Sun Aug 28 15:25:35 2016 -0400

----------------------------------------------------------------------
 .../arrow/parquet/parquet-reader-writer-test.cc |  12 ++-
 cpp/src/arrow/parquet/parquet-schema-test.cc    |  23 +++-
 cpp/src/arrow/parquet/reader.cc                 |   1 +
 cpp/src/arrow/parquet/schema.cc                 |  13 ++-
 cpp/src/arrow/parquet/writer.cc                 |   1 +
 cpp/src/arrow/types/construct.cc                |   3 +-
 cpp/src/arrow/types/datetime.h                  |  12 ++-
 cpp/src/arrow/types/primitive.cc                |   1 +
 cpp/src/arrow/types/primitive.h                 |  11 ++
 python/pyarrow/array.pyx                        |  40 ++++++-
 python/pyarrow/includes/libarrow.pxd            |   1 +
 python/pyarrow/tests/test_convert_pandas.py     |  24 ++++-
 python/pyarrow/tests/test_parquet.py            |   4 +-
 python/src/pyarrow/adapters/pandas.cc           | 107 +++++++++++++++++--
 14 files changed, 232 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
index bfc27d2..d7b39dd 100644
--- a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
+++ b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
@@ -138,6 +138,15 @@ struct test_traits<Int64Type> {
 const int64_t test_traits<Int64Type>::value(-1024);
 
 template <>
+struct test_traits<TimestampType> {
+  static constexpr ParquetType::type parquet_enum = ParquetType::INT64;
+  static constexpr LogicalType::type logical_enum = LogicalType::TIMESTAMP_MILLIS;
+  static int64_t const value;
+};
+
+const int64_t test_traits<TimestampType>::value(14695634030000);
+
+template <>
 struct test_traits<FloatType> {
   static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT;
   static constexpr LogicalType::type logical_enum = LogicalType::NONE;
@@ -248,7 +257,8 @@ class TestParquetIO : public ::testing::Test {
 // Parquet version 1.0.
 
 typedef ::testing::Types<BooleanType, UInt8Type, Int8Type, UInt16Type, Int16Type,
-    Int32Type, UInt64Type, Int64Type, FloatType, DoubleType, StringType> TestTypes;
+    Int32Type, UInt64Type, Int64Type, TimestampType, FloatType, DoubleType,
+    StringType> TestTypes;
 
 TYPED_TEST_CASE(TestParquetIO, TestTypes);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/parquet-schema-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc
index 819cdd3..a2bcd3e 100644
--- a/cpp/src/arrow/parquet/parquet-schema-test.cc
+++ b/cpp/src/arrow/parquet/parquet-schema-test.cc
@@ -22,6 +22,7 @@
 
 #include "arrow/test-util.h"
 #include "arrow/type.h"
+#include "arrow/types/datetime.h"
 #include "arrow/types/decimal.h"
 #include "arrow/util/status.h"
 
@@ -45,6 +46,9 @@ const auto INT64 = std::make_shared<Int64Type>();
 const auto FLOAT = std::make_shared<FloatType>();
 const auto DOUBLE = std::make_shared<DoubleType>();
 const auto UTF8 = std::make_shared<StringType>();
+const auto TIMESTAMP_MS = std::make_shared<TimestampType>(TimestampType::Unit::MILLI);
+// TODO: This requires parquet-cpp implementing the MICROS enum value
+// const auto TIMESTAMP_US = std::make_shared<TimestampType>(TimestampType::Unit::MICRO);
 const auto BINARY = std::make_shared<ListType>(std::make_shared<Field>("", UINT8));
 const auto DECIMAL_8_4 = std::make_shared<DecimalType>(8, 4);
 
@@ -89,6 +93,14 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) {
       PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
   arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
 
+  parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+      ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
+  arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
+
+  // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+  //     ParquetType::INT64, LogicalType::TIMESTAMP_MICROS));
+  // arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_US, false));
+
   parquet_fields.push_back(
       PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
   arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));
@@ -153,9 +165,6 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) {
   unsupported_nodes.push_back(PrimitiveNode::Make(
       "int32", Repetition::OPTIONAL, ParquetType::INT32, LogicalType::DATE));
 
-  unsupported_nodes.push_back(PrimitiveNode::Make(
-      "int64", Repetition::OPTIONAL, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
-
   for (const NodePtr& node : unsupported_nodes) {
     ASSERT_RAISES(NotImplemented, ConvertSchema({node}));
   }
@@ -209,6 +218,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) {
       PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
   arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
 
+  parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+      ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
+  arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
+
+  // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+  //     ParquetType::INT64, LogicalType::TIMESTAMP_MICROS));
+  // arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_US, false));
+
   parquet_fields.push_back(
       PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
   arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc
index e92967e..9f62125 100644
--- a/cpp/src/arrow/parquet/reader.cc
+++ b/cpp/src/arrow/parquet/reader.cc
@@ -368,6 +368,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>*
     TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
     TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
     TYPED_BATCH_CASE(STRING, StringType, ::parquet::ByteArrayType)
+    TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type)
     default:
       return Status::NotImplemented(field_->type->ToString());
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc
index a79342a..cd91df3 100644
--- a/cpp/src/arrow/parquet/schema.cc
+++ b/cpp/src/arrow/parquet/schema.cc
@@ -52,6 +52,7 @@ const auto INT64 = std::make_shared<Int64Type>();
 const auto FLOAT = std::make_shared<FloatType>();
 const auto DOUBLE = std::make_shared<DoubleType>();
 const auto UTF8 = std::make_shared<StringType>();
+const auto TIMESTAMP_MS = std::make_shared<TimestampType>(TimestampType::Unit::MILLI);
 const auto BINARY = std::make_shared<ListType>(std::make_shared<Field>("", UINT8));
 
 TypePtr MakeDecimalType(const PrimitiveNode* node) {
@@ -133,6 +134,9 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
     case LogicalType::DECIMAL:
       *out = MakeDecimalType(node);
       break;
+    case LogicalType::TIMESTAMP_MILLIS:
+      *out = TIMESTAMP_MS;
+      break;
     default:
       return Status::NotImplemented("Unhandled logical type for int64");
       break;
@@ -289,10 +293,15 @@ Status FieldToNode(const std::shared_ptr<Field>& field,
       type = ParquetType::INT32;
       logical_type = LogicalType::DATE;
       break;
-    case Type::TIMESTAMP:
+    case Type::TIMESTAMP: {
+      auto timestamp_type = static_cast<TimestampType*>(field->type.get());
+      if (timestamp_type->unit != TimestampType::Unit::MILLI) {
+        return Status::NotImplemented(
+            "Other timestamp units than millisecond are not yet support with parquet.");
+      }
       type = ParquetType::INT64;
       logical_type = LogicalType::TIMESTAMP_MILLIS;
-      break;
+    } break;
     case Type::TIMESTAMP_DOUBLE:
       type = ParquetType::INT64;
       // This is specified as seconds since the UNIX epoch

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc
index f9514aa..ddee573 100644
--- a/cpp/src/arrow/parquet/writer.cc
+++ b/cpp/src/arrow/parquet/writer.cc
@@ -240,6 +240,7 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
       TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type)
       TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type)
       TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type)
+      TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type)
       TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
       TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
     default:

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/construct.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
index 5ae9c5a..0b71ea9 100644
--- a/cpp/src/arrow/types/construct.cc
+++ b/cpp/src/arrow/types/construct.cc
@@ -51,6 +51,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
     BUILDER_CASE(INT32, Int32Builder);
     BUILDER_CASE(UINT64, UInt64Builder);
     BUILDER_CASE(INT64, Int64Builder);
+    BUILDER_CASE(TIMESTAMP, TimestampBuilder);
 
     BUILDER_CASE(BOOL, BooleanBuilder);
 
@@ -105,7 +106,7 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
     MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array);
     MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array);
     MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
-    MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, Int64Array);
+    MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
     MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray);
     MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
     MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h
index b782455..241a126 100644
--- a/cpp/src/arrow/types/datetime.h
+++ b/cpp/src/arrow/types/datetime.h
@@ -18,6 +18,8 @@
 #ifndef ARROW_TYPES_DATETIME_H
 #define ARROW_TYPES_DATETIME_H
 
+#include <string>
+
 #include "arrow/type.h"
 
 namespace arrow {
@@ -34,15 +36,23 @@ struct DateType : public DataType {
   static char const* name() { return "date"; }
 };
 
-struct TimestampType : public DataType {
+struct ARROW_EXPORT TimestampType : public DataType {
   enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
 
+  typedef int64_t c_type;
+  static constexpr Type::type type_enum = Type::TIMESTAMP;
+
+  int value_size() const override { return sizeof(int64_t); }
+
   Unit unit;
 
   explicit TimestampType(Unit unit = Unit::MILLI)
       : DataType(Type::TIMESTAMP), unit(unit) {}
 
   TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
+  virtual ~TimestampType() {}
+
+  std::string ToString() const override { return "timestamp"; }
 
   static char const* name() { return "timestamp"; }
 };

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index f4b47f9..375e94f 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -158,6 +158,7 @@ template class PrimitiveBuilder<Int8Type>;
 template class PrimitiveBuilder<Int16Type>;
 template class PrimitiveBuilder<Int32Type>;
 template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<TimestampType>;
 template class PrimitiveBuilder<FloatType>;
 template class PrimitiveBuilder<DoubleType>;
 template class PrimitiveBuilder<BooleanType>;

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/primitive.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index 770de76..c643783 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -26,6 +26,7 @@
 #include "arrow/array.h"
 #include "arrow/builder.h"
 #include "arrow/type.h"
+#include "arrow/types/datetime.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/buffer.h"
 #include "arrow/util/status.h"
@@ -100,6 +101,7 @@ NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type);
 NUMERIC_ARRAY_DECL(Int32Array, Int32Type);
 NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type);
 NUMERIC_ARRAY_DECL(Int64Array, Int64Type);
+NUMERIC_ARRAY_DECL(TimestampArray, TimestampType);
 NUMERIC_ARRAY_DECL(FloatArray, FloatType);
 NUMERIC_ARRAY_DECL(DoubleArray, DoubleType);
 
@@ -235,7 +237,15 @@ struct type_traits<Int64Type> {
 
   static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
 };
+
+template <>
+struct type_traits<TimestampType> {
+  typedef TimestampArray ArrayType;
+
+  static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+};
 template <>
+
 struct type_traits<FloatType> {
   typedef FloatArray ArrayType;
 
@@ -260,6 +270,7 @@ typedef NumericBuilder<Int8Type> Int8Builder;
 typedef NumericBuilder<Int16Type> Int16Builder;
 typedef NumericBuilder<Int32Type> Int32Builder;
 typedef NumericBuilder<Int64Type> Int64Builder;
+typedef NumericBuilder<TimestampType> TimestampBuilder;
 
 typedef NumericBuilder<FloatType> FloatBuilder;
 typedef NumericBuilder<DoubleType> DoubleBuilder;

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 619e5ef..5229b42 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -19,6 +19,8 @@
 # distutils: language = c++
 # cython: embedsignature = True
 
+import numpy as np
+
 from pyarrow.includes.libarrow cimport *
 cimport pyarrow.includes.pyarrow as pyarrow
 
@@ -186,6 +188,7 @@ cdef dict _array_classes = {
     Type_DOUBLE: DoubleArray,
     Type_LIST: ListArray,
     Type_STRING: StringArray,
+    Type_TIMESTAMP: Int64Array,
 }
 
 cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
@@ -217,11 +220,28 @@ def from_pylist(object list_obj, DataType type=None):
     return box_arrow_array(sp_array)
 
 
-def from_pandas_series(object series, object mask=None):
+def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
+    """
+    Convert pandas.Series to an Arrow Array.
+
+    Parameters
+    ----------
+    series: pandas.Series or numpy.ndarray
+
+    mask: pandas.Series or numpy.ndarray
+        array to mask null entries in the series
+
+    timestamps_to_ms: bool
+        Convert datetime columns to ms resolution. This is needed for
+        compability with other functionality like Parquet I/O which
+        only supports milliseconds.
+    """
     cdef:
         shared_ptr[CArray] out
 
     series_values = series_as_ndarray(series)
+    if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
+        series_values = series_values.astype('datetime64[ms]')
 
     if mask is None:
         check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(),
@@ -234,14 +254,28 @@ def from_pandas_series(object series, object mask=None):
     return box_arrow_array(out)
 
 
-def from_pandas_dataframe(object df, name=None):
+def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False):
+    """
+    Convert pandas.DataFrame to an Arrow Table
+
+    Parameters
+    ----------
+    df: pandas.DataFrame
+
+    name: str
+
+    timestamps_to_ms: bool
+        Convert datetime columns to ms resolution. This is needed for
+        compability with other functionality like Parquet I/O which
+        only supports milliseconds.
+    """
     cdef:
         list names = []
         list arrays = []
 
     for name in df.columns:
         col = df[name]
-        arr = from_pandas_series(col)
+        arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms)
 
         names.append(name)
         arrays.append(arr)

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 91ce069..854d07d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -38,6 +38,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         Type_FLOAT" arrow::Type::FLOAT"
         Type_DOUBLE" arrow::Type::DOUBLE"
 
+        Type_TIMESTAMP" arrow::Type::TIMESTAMP"
         Type_STRING" arrow::Type::STRING"
 
         Type_LIST" arrow::Type::LIST"

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 6dc9c68..5530299 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -33,8 +33,9 @@ class TestPandasConversion(unittest.TestCase):
     def tearDown(self):
         pass
 
-    def _check_pandas_roundtrip(self, df, expected=None):
-        table = A.from_pandas_dataframe(df)
+    def _check_pandas_roundtrip(self, df, expected=None,
+                                timestamps_to_ms=False):
+        table = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms)
         result = table.to_pandas()
         if expected is None:
             expected = df
@@ -164,6 +165,25 @@ class TestPandasConversion(unittest.TestCase):
         expected = pd.DataFrame({'strings': values * repeats})
         self._check_pandas_roundtrip(df, expected)
 
+    def test_timestamps_notimezone(self):
+        df = pd.DataFrame({
+            'datetime64': np.array([
+                '2007-07-13T01:23:34.123',
+                '2006-01-13T12:34:56.432',
+                '2010-08-13T05:46:57.437'],
+                dtype='datetime64[ms]')
+            })
+        self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+
+        df = pd.DataFrame({
+            'datetime64': np.array([
+                '2007-07-13T01:23:34.123456789',
+                '2006-01-13T12:34:56.432539784',
+                '2010-08-13T05:46:57.437699912'],
+                dtype='datetime64[ns]')
+            })
+        self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+
     # def test_category(self):
     #     repeats = 1000
     #     values = [b'foo', None, u'bar', 'qux', np.nan]

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index de9cfbb..d89d947 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -57,11 +57,13 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
         'float32': np.arange(size, dtype=np.float32),
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
+        # Pandas only support ns resolution, Arrow at the moment only ms
+        'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'),
         'str': [str(x) for x in range(size)],
         'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None]
     })
     filename = tmpdir.join('pandas_rountrip.parquet')
-    arrow_table = A.from_pandas_dataframe(df)
+    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
     A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
     table_read = pyarrow.parquet.read_table(filename.strpath)
     df_read = table_read.to_pandas()

http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index 8dcc2b1..a4e7fb6 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -38,6 +38,7 @@ namespace pyarrow {
 
 using arrow::Array;
 using arrow::Column;
+using arrow::DataType;
 namespace util = arrow::util;
 
 // ----------------------------------------------------------------------
@@ -50,7 +51,7 @@ struct npy_traits {
 template <>
 struct npy_traits<NPY_BOOL> {
   typedef uint8_t value_type;
-  using ArrayType = arrow::BooleanArray;
+  using TypeClass = arrow::BooleanType;
 
   static constexpr bool supports_nulls = false;
   static inline bool isnull(uint8_t v) {
@@ -62,7 +63,7 @@ struct npy_traits<NPY_BOOL> {
   template <>                                       \
   struct npy_traits<NPY_##TYPE> {                   \
     typedef T value_type;                           \
-    using ArrayType = arrow::CapType##Array;        \
+    using TypeClass = arrow::CapType##Type;         \
                                                     \
     static constexpr bool supports_nulls = false;   \
     static inline bool isnull(T v) {                \
@@ -82,7 +83,7 @@ NPY_INT_DECL(UINT64, UInt64, uint64_t);
 template <>
 struct npy_traits<NPY_FLOAT32> {
   typedef float value_type;
-  using ArrayType = arrow::FloatArray;
+  using TypeClass = arrow::FloatType;
 
   static constexpr bool supports_nulls = true;
 
@@ -94,7 +95,7 @@ struct npy_traits<NPY_FLOAT32> {
 template <>
 struct npy_traits<NPY_FLOAT64> {
   typedef double value_type;
-  using ArrayType = arrow::DoubleArray;
+  using TypeClass = arrow::DoubleType;
 
   static constexpr bool supports_nulls = true;
 
@@ -104,6 +105,22 @@ struct npy_traits<NPY_FLOAT64> {
 };
 
 template <>
+struct npy_traits<NPY_DATETIME> {
+  typedef double value_type;
+  using TypeClass = arrow::TimestampType;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63
+    // = -0x8000000000000000
+    // = -9223372036854775808;
+    // = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
+template <>
 struct npy_traits<NPY_OBJECT> {
   typedef PyObject* value_type;
   static constexpr bool supports_nulls = true;
@@ -206,6 +223,8 @@ class ArrowSerializer {
     return Status::OK();
   }
 
+  Status MakeDataType(std::shared_ptr<DataType>* out);
+
   arrow::MemoryPool* pool_;
 
   PyArrayObject* arr_;
@@ -254,6 +273,39 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap)
 }
 
 template <int TYPE>
+inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out) {
+  out->reset(new typename npy_traits<TYPE>::TypeClass());
+  return Status::OK();
+}
+
+template <>
+inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataType>* out) {
+  PyArray_Descr* descr = PyArray_DESCR(arr_);
+  auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+  arrow::TimestampType::Unit unit;
+
+  switch (date_dtype->meta.base) {
+      case NPY_FR_s:
+          unit = arrow::TimestampType::Unit::SECOND;
+          break;
+      case NPY_FR_ms:
+          unit = arrow::TimestampType::Unit::MILLI;
+          break;
+      case NPY_FR_us:
+          unit = arrow::TimestampType::Unit::MICRO;
+          break;
+      case NPY_FR_ns:
+          unit = arrow::TimestampType::Unit::NANO;
+          break;
+      default:
+          return Status::ValueError("Unknown NumPy datetime unit");
+  }
+
+  out->reset(new arrow::TimestampType(unit));
+  return Status::OK();
+}
+
+template <int TYPE>
 inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   typedef npy_traits<TYPE> traits;
 
@@ -269,9 +321,9 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   }
 
   RETURN_NOT_OK(ConvertData());
-  *out = std::make_shared<typename traits::ArrayType>(length_, data_, null_count,
-      null_bitmap_);
-
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(MakeDataType(&type));
+  RETURN_ARROW_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out));
   return Status::OK();
 }
 
@@ -402,6 +454,7 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
     TO_ARROW_CASE(UINT64);
     TO_ARROW_CASE(FLOAT32);
     TO_ARROW_CASE(FLOAT64);
+    TO_ARROW_CASE(DATETIME);
     TO_ARROW_CASE(OBJECT);
     default:
       std::stringstream ss;
@@ -477,6 +530,17 @@ struct arrow_traits<arrow::Type::DOUBLE> {
 };
 
 template <>
+struct arrow_traits<arrow::Type::TIMESTAMP> {
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = true;
+  static constexpr bool is_floating = false;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
 struct arrow_traits<arrow::Type::STRING> {
   static constexpr int npy_type = NPY_OBJECT;
   static constexpr bool supports_nulls = true;
@@ -494,6 +558,30 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
 #endif
 }
 
+inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
+  if (type == NPY_DATETIME) {
+    auto timestamp_type = static_cast<arrow::TimestampType*>(datatype);
+    // We only support ms resolution at the moment
+    PyArray_Descr* descr = PyArray_DESCR(out);
+    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+
+    switch (timestamp_type->unit) {
+      case arrow::TimestampType::Unit::SECOND:
+        date_dtype->meta.base = NPY_FR_s;
+        break;
+      case arrow::TimestampType::Unit::MILLI:
+        date_dtype->meta.base = NPY_FR_ms;
+        break;
+      case arrow::TimestampType::Unit::MICRO:
+        date_dtype->meta.base = NPY_FR_us;
+        break;
+      case arrow::TimestampType::Unit::NANO:
+        date_dtype->meta.base = NPY_FR_ns;
+        break;
+    }
+  }
+}
+
 template <int TYPE>
 class ArrowDeserializer {
  public:
@@ -522,6 +610,8 @@ class ArrowDeserializer {
       return Status::OK();
     }
 
+    set_numpy_metadata(type, col_->type().get(), out_);
+
     return Status::OK();
   }
 
@@ -538,6 +628,8 @@ class ArrowDeserializer {
       return Status::OK();
     }
 
+    set_numpy_metadata(type, col_->type().get(), out_);
+
     if (PyArray_SetBaseObject(out_, py_ref_) == -1) {
       // Error occurred, trust that SetBaseObject set the error state
       return Status::OK();
@@ -713,6 +805,7 @@ Status ArrowToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref,
     FROM_ARROW_CASE(FLOAT);
     FROM_ARROW_CASE(DOUBLE);
     FROM_ARROW_CASE(STRING);
+    FROM_ARROW_CASE(TIMESTAMP);
     default:
       return Status::NotImplemented("Arrow type reading not implemented");
   }


Mime
View raw message