From "Marco Neumann (JIRA)" <>
Subject [jira] [Created] (ARROW-5028) Arrow->Parquet store drops and corrupts values
Date Wed, 27 Mar 2019 14:25:00 GMT
Marco Neumann created ARROW-5028:

             Summary: Arrow->Parquet store drops and corrupts values
                 Key: ARROW-5028
             Project: Apache Arrow
          Issue Type: Bug
          Components: Python
    Affects Versions: 0.11.1, 0.13.0
            Reporter: Marco Neumann
         Attachments: dct.pickle.gz

I am sorry if this bug report feels rather long and the reproduction data is large, but I was not
able to reduce the data any further while still triggering the problem. I was able to trigger
this behavior on master and on {{0.11.1}}.

import io
import os.path
import pickle

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

def dct_to_table(index_dct):
    """Build a two-column Arrow table from a dict.

    Column ``a`` is built from the dict keys and column ``b`` from the dict
    values. Both go through a NumPy array before ``pa.array``; that
    conversion path is kept exactly as in the original report.

    :param index_dct: mapping whose keys and values become the two columns.
    :return: a ``pa.Table`` with columns ``a`` and ``b``.
    """
    labeled_array = pa.array(np.array(list(index_dct.keys())))
    partition_array = pa.array(np.array(list(index_dct.values())))

    return pa.Table.from_arrays(
        [labeled_array, partition_array], names=['a', 'b']
    )

def check_pq_nulls(data):
    """Assert that the Parquet statistics report no nulls for ``b.list.item``.

    ``data`` is the raw bytes of a Parquet file. The file must consist of a
    single row group, and the column chunk at index 1 of that row group must
    be ``b.list.item``.
    """
    parquet_file = pq.ParquetFile(io.BytesIO(data))
    assert parquet_file.num_row_groups == 1

    column_chunk = parquet_file.metadata.row_group(0).column(1)
    assert column_chunk.path_in_schema == 'b.list.item'
    assert column_chunk.statistics.null_count == 0  # fails

def roundtrip(table):
    """Write ``table`` to an in-memory Parquet file and read it back.

    Returns the table produced by ``pq.read_table`` on the written bytes.
    """
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    data = sink.getvalue().to_pybytes()

    # this fails:
    #   check_pq_nulls(data)

    return pq.read_table(pa.BufferReader(data))

# Load the reproduction data from a file named ``dct.pickle`` located next to
# this script. The report's attachment is ``dct.pickle.gz``, so presumably it
# has to be decompressed first.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
with open(os.path.join(os.path.dirname(__file__), 'dct.pickle'), 'rb') as fp:
    dct = pickle.load(fp)

# this does NOT help:
#   pa.set_cpu_count(1)
#   import gc; gc.disable()

table = dct_to_table(dct)

# this fixes the issue:
#   table = pa.Table.from_pandas(table.to_pandas())

table2 = roundtrip(table)

# Column 'b' is expected to contain no nulls both before and after the Parquet
# round trip; the second assertion is the one reported to fail.
assert table.column('b').null_count == 0
assert table2.column('b').null_count == 0  # fails

# if table2 is converted to pandas, you can also observe that some values at the end of column
# b are `['']`, which clearly are not present in the original data

I would also be thankful for any pointers on where the bug comes from or on how to reduce
the test case.

