arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-573: [C++/Python] Implement IPC metadata handling for ordered dictionaries, pandas conversions
Date Tue, 01 Aug 2017 15:57:48 GMT
Repository: arrow
Updated Branches:
  refs/heads/master b8754eba4 -> aa1d753a7


ARROW-573: [C++/Python] Implement IPC metadata handling for ordered dictionaries, pandas conversions

This was an oversight in the IPC implementation and pandas conversion path, and has been fixed.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #922 from wesm/ARROW-573 and squashes the following commits:

458820e5 [Wes McKinney] Suppress C4800 in MSVC
46361f3f [Wes McKinney] Implement IPC metadata handling for ordered dictionaries, faithful
conversion to/from pandas.Categorical


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/aa1d753a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/aa1d753a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/aa1d753a

Branch: refs/heads/master
Commit: aa1d753a74b5517c0b20db8e5540786520b9956f
Parents: b8754eb
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Tue Aug 1 11:57:43 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Tue Aug 1 11:57:43 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc                 | 11 +++++++----
 cpp/src/arrow/compare.cc                    |  3 ++-
 cpp/src/arrow/ipc/metadata.cc               |  5 +++--
 cpp/src/arrow/ipc/test-common.h             |  2 +-
 cpp/src/arrow/type.cc                       |  7 ++++---
 cpp/src/arrow/type.h                        |  5 +++--
 python/CMakeLists.txt                       |  4 ++++
 python/pyarrow/array.pxi                    | 11 ++++++++---
 python/pyarrow/includes/libarrow.pxd        |  4 +++-
 python/pyarrow/pandas_compat.py             |  3 ++-
 python/pyarrow/tests/test_convert_pandas.py |  3 +++
 python/pyarrow/tests/test_ipc.py            | 22 ++++++++++++++++++----
 python/pyarrow/types.pxi                    | 11 +++++++++--
 13 files changed, 67 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 0efb51c..57d2c8b 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1881,15 +1881,18 @@ TEST(TestDictionary, Basics) {
 
   std::shared_ptr<DictionaryType> type1 =
       std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict));
-  DictionaryType type2(int16(), dict);
+
+  auto type2 =
+      std::dynamic_pointer_cast<DictionaryType>(::arrow::dictionary(int16(), dict,
true));
 
   ASSERT_TRUE(int16()->Equals(type1->index_type()));
   ASSERT_TRUE(type1->dictionary()->Equals(dict));
 
-  ASSERT_TRUE(int16()->Equals(type2.index_type()));
-  ASSERT_TRUE(type2.dictionary()->Equals(dict));
+  ASSERT_TRUE(int16()->Equals(type2->index_type()));
+  ASSERT_TRUE(type2->dictionary()->Equals(dict));
 
-  ASSERT_EQ("dictionary<values=int32, indices=int16>", type1->ToString());
+  ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=0>", type1->ToString());
+  ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=1>", type2->ToString());
 }
 
 TEST(TestDictionary, Equals) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/compare.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index dda5fdd..3a4a400 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -769,7 +769,8 @@ class TypeEqualsVisitor {
   Status Visit(const DictionaryType& left) {
     const auto& right = static_cast<const DictionaryType&>(right_);
     result_ = left.index_type()->Equals(right.index_type()) &&
-              left.dictionary()->Equals(right.dictionary());
+              left.dictionary()->Equals(right.dictionary()) &&
+              (left.ordered() == right.ordered());
     return Status::OK();
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/ipc/metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index 20fd280..d764e20 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -492,7 +492,8 @@ static DictionaryOffset GetDictionaryEncoding(FBB& fbb, const DictionaryType&
ty
   auto index_type_offset = flatbuf::CreateInt(fbb, fw_index_type.bit_width(), true);
 
   // TODO(wesm): ordered dictionaries
-  return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset);
+  return flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset,
+                                           type.ordered());
 }
 
 static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr<Field>& field,
@@ -551,7 +552,7 @@ static Status FieldFromFlatbuffer(const flatbuf::Field* field,
 
     std::shared_ptr<DataType> index_type;
     RETURN_NOT_OK(IntFromFlatbuffer(encoding->indexType(), &index_type));
-    type = std::make_shared<DictionaryType>(index_type, dictionary);
+    type = ::arrow::dictionary(index_type, dictionary, encoding->isOrdered());
   }
   *out = std::make_shared<Field>(field->name()->str(), type, field->nullable());
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index cb82737..76cc843 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -462,7 +462,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
   ArrayFromVector<StringType, std::string>(dict2_values, &dict2);
 
   auto f0_type = arrow::dictionary(arrow::int32(), dict1);
-  auto f1_type = arrow::dictionary(arrow::int8(), dict1);
+  auto f1_type = arrow::dictionary(arrow::int8(), dict1, true);
   auto f2_type = arrow::dictionary(arrow::int32(), dict2);
 
   std::shared_ptr<Array> indices0, indices1, indices2;

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index b8489d4..edf4d33 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -236,7 +236,7 @@ std::shared_ptr<Array> DictionaryType::dictionary() const { return
dictionary_;
 std::string DictionaryType::ToString() const {
   std::stringstream ss;
   ss << "dictionary<values=" << dictionary_->type()->ToString()
-     << ", indices=" << index_type_->ToString() << ">";
+     << ", indices=" << index_type_->ToString() << ", ordered=" <<
ordered_ << ">";
   return ss.str();
 }
 
@@ -428,8 +428,9 @@ std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>&
chil
 }
 
 std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
-                                     const std::shared_ptr<Array>& dict_values)
{
-  return std::make_shared<DictionaryType>(index_type, dict_values);
+                                     const std::shared_ptr<Array>& dict_values,
+                                     bool ordered) {
+  return std::make_shared<DictionaryType>(index_type, dict_values, ordered);
 }
 
 std::shared_ptr<Field> field(const std::string& name,

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 45d97fd..b28fe92 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -785,8 +785,9 @@ std::shared_ptr<DataType> ARROW_EXPORT
 union_(const std::vector<std::shared_ptr<Field>>& child_fields,
        const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
 
-std::shared_ptr<DataType> ARROW_EXPORT dictionary(
-    const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>&
values);
+std::shared_ptr<DataType> ARROW_EXPORT
+dictionary(const std::shared_ptr<DataType>& index_type,
+           const std::shared_ptr<Array>& values, bool ordered = false);
 
 std::shared_ptr<Field> ARROW_EXPORT field(
     const std::string& name, const std::shared_ptr<DataType>& type, bool nullable
= true,

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bfae157..af95073 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -92,6 +92,10 @@ else()
   # Cython generates some bitshift expressions that MSVC does not like in
   # __Pyx_PyFloat_DivideObjC
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293")
+
+  # Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning
+  # seem harmless, and probably not worth the effort of working around it
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800")
 endif()
 
 if ("${COMPILER_FAMILY}" STREQUAL "clang")

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 67418aa..f320cbe 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -189,7 +189,8 @@ cdef class Array:
         if isinstance(values, Categorical):
             return DictionaryArray.from_arrays(
                 values.codes, values.categories.values,
-                mask=mask, memory_pool=memory_pool)
+                mask=mask, ordered=values.ordered,
+                memory_pool=memory_pool)
         elif values.dtype == object:
             # Object dtype undergoes a different conversion path as more type
             # inference may be needed
@@ -564,7 +565,7 @@ cdef class DictionaryArray(Array):
             return self._indices
 
     @staticmethod
-    def from_arrays(indices, dictionary, mask=None,
+    def from_arrays(indices, dictionary, mask=None, ordered=False,
                     MemoryPool memory_pool=None):
         """
         Construct Arrow DictionaryArray from array of indices (must be
@@ -576,6 +577,8 @@ cdef class DictionaryArray(Array):
         dictionary : ndarray or pandas.Series
         mask : ndarray or pandas.Series, boolean type
             True values indicate that indices are actually null
+        ordered : boolean, default False
+            Set to True if the category values are ordered
 
         Returns
         -------
@@ -609,8 +612,10 @@ cdef class DictionaryArray(Array):
         if not isinstance(arrow_indices, IntegerArray):
             raise ValueError('Indices must be integer type')
 
+        cdef c_bool c_ordered = ordered
+
         c_type.reset(new CDictionaryType(arrow_indices.type.sp_type,
-                                         arrow_dictionary.sp_array))
+                                         arrow_dictionary.sp_array, c_ordered))
         c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array))
 
         result = DictionaryArray()

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8d7e279..a25d7a2 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -132,10 +132,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
     cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType):
         CDictionaryType(const shared_ptr[CDataType]& index_type,
-                        const shared_ptr[CArray]& dictionary)
+                        const shared_ptr[CArray]& dictionary,
+                        c_bool ordered)
 
         shared_ptr[CDataType] index_type()
         shared_ptr[CArray] dictionary()
+        c_bool ordered()
 
     shared_ptr[CDataType] ctimestamp" arrow::timestamp"(TimeUnit unit)
     shared_ptr[CDataType] ctimestamp" arrow::timestamp"(

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index cd7ad47..62547a4 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -284,9 +284,10 @@ def table_to_blockmanager(table, nthreads=1):
         block_arr = item['block']
         placement = item['placement']
         if 'dictionary' in item:
+            ordered = block_table.schema[placement[0]].type.ordered
             cat = pd.Categorical(block_arr,
                                  categories=item['dictionary'],
-                                 ordered=False, fastpath=True)
+                                 ordered=ordered, fastpath=True)
             block = _int.make_block(cat, placement=placement,
                                     klass=_int.CategoricalBlock,
                                     fastpath=True)

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index d488658..f6ea163 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -536,6 +536,9 @@ class TestPandasConversion(unittest.TestCase):
         df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
                            'cat_ints': pd.Categorical(v2 * repeats),
                            'cat_binary': pd.Categorical(v3 * repeats),
+                           'cat_strings_ordered': pd.Categorical(
+                               v1 * repeats, categories=['bar', 'qux', 'foo'],
+                               ordered=True),
                            'ints': v2 * repeats,
                            'ints2': v2 * repeats,
                            'strings': v1 * repeats,

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 3ad369c..120a982 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -40,22 +40,20 @@ class MessagingTest(object):
     def _get_source(self):
         return self.sink.getvalue()
 
-    def write_batches(self):
+    def write_batches(self, num_batches=5):
         nrows = 5
         df = pd.DataFrame({
             'one': np.random.randn(nrows),
             'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']})
-
         batch = pa.RecordBatch.from_pandas(df)
 
         writer = self._get_writer(self.sink, batch.schema)
 
-        num_batches = 5
         frames = []
         batches = []
         for i in range(num_batches):
             unique_df = df.copy()
-            unique_df['one'] = np.random.randn(nrows)
+            unique_df['one'] = np.random.randn(len(df))
 
             batch = pa.RecordBatch.from_pandas(unique_df)
             writer.write_batch(batch)
@@ -122,6 +120,22 @@ class TestStream(MessagingTest, unittest.TestCase):
         with pytest.raises(pa.ArrowInvalid):
             pa.open_stream(buf)
 
+    def test_categorical_roundtrip(self):
+        df = pd.DataFrame({
+            'one': np.random.randn(5),
+            'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
+                                  categories=['foo', 'bar'],
+                                  ordered=True)
+        })
+        batch = pa.RecordBatch.from_pandas(df)
+        writer = self._get_writer(self.sink, batch.schema)
+        writer.write_batch(pa.RecordBatch.from_pandas(df))
+        writer.close()
+
+        table = (pa.open_stream(pa.BufferReader(self._get_source()))
+                 .read_all())
+        assert_frame_equal(table.to_pandas(), df)
+
     def test_simple_roundtrip(self):
         _, batches = self.write_batches()
         file_contents = pa.BufferReader(self._get_source())

http://git-wip-us.apache.org/repos/asf/arrow/blob/aa1d753a/python/pyarrow/types.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index fefde55..ad2f336 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -97,6 +97,11 @@ cdef class DictionaryType(DataType):
         DataType.init(self, type)
         self.dict_type = <const CDictionaryType*> type.get()
 
+    property ordered:
+
+        def __get__(self):
+            return self.dict_type.ordered()
+
 
 cdef class ListType(DataType):
 
@@ -798,7 +803,8 @@ cpdef ListType list_(value_type):
     return out
 
 
-cpdef DictionaryType dictionary(DataType index_type, Array dictionary):
+cpdef DictionaryType dictionary(DataType index_type, Array dictionary,
+                                bint ordered=False):
     """
     Dictionary (categorical, or simply encoded) type
 
@@ -814,7 +820,8 @@ cpdef DictionaryType dictionary(DataType index_type, Array dictionary):
     cdef DictionaryType out = DictionaryType()
     cdef shared_ptr[CDataType] dict_type
     dict_type.reset(new CDictionaryType(index_type.sp_type,
-                                        dictionary.sp_array))
+                                        dictionary.sp_array,
+                                        ordered == 1))
     out.init(dict_type)
     return out
 


Mime
View raw message