From: wesm@apache.org
To: commits@arrow.apache.org
Subject: arrow git commit: ARROW-1079: [Python] Filter out private directories when building Parquet dataset manifest
Date: Tue, 18 Jul 2017 00:34:56 +0000 (UTC)

Repository: arrow
Updated Branches:
  refs/heads/master b4d34f8fd -> a1c8b83b4


ARROW-1079: [Python] Filter out private directories when building Parquet dataset manifest

Some systems, such as Hive and Impala, use special files or directories to
signal to other readers that a dataset modification is in progress. If such
directories (starting with an underscore) exist in a flat Parquet directory,
they currently break the dataset reader.

Author: Wes McKinney

Closes #860 from wesm/ARROW-1079 and squashes the following commits:

c1c445b4 [Wes McKinney] Filter out private directories when building Parquet dataset manifest


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/a1c8b83b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/a1c8b83b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/a1c8b83b

Branch: refs/heads/master
Commit: a1c8b83b49192230bd2c91bd009e2ff272d89310
Parents: b4d34f8
Author: Wes McKinney
Authored: Mon Jul 17 20:34:51 2017 -0400
Committer: Wes McKinney
Committed: Mon Jul 17 20:34:51 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/parquet.py            |  9 ++++++++
 python/pyarrow/tests/test_parquet.py | 35 +++++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
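For context, a small self-contained sketch of the scenario the commit message
describes, using the public pyarrow.parquet API. The temporary directory and
data below are invented for illustration; only the '_impala_staging' name is
taken from the new test added in this commit.

    # Hypothetical illustration (not part of the commit): a flat Parquet
    # directory that also contains an Impala-style staging directory.
    import os
    import tempfile

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    base = tempfile.mkdtemp()
    for i in range(3):
        df = pd.DataFrame({'values': list(range(i, i + 5))})
        pq.write_table(pa.Table.from_pandas(df),
                       os.path.join(base, '{0}.parquet'.format(i)))

    # A writer such as Impala may leave a "private" directory behind while a
    # modification is in progress.
    os.mkdir(os.path.join(base, '_impala_staging'))

    # Before this patch the underscore directory broke the manifest walk; with
    # the patch it is filtered out and the dataset reads normally.
    table = pq.ParquetDataset(base).read()
    print(table.num_rows)  # 15 rows from the three data files
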
http://git-wip-us.apache.org/repos/asf/arrow/blob/a1c8b83b/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index dc26dab..aa2352c 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import json
 
 import six
@@ -414,6 +415,9 @@ class ParquetManifest(object):
             elif fs.isdir(path):
                 directories.append(path)
 
+        # ARROW-1079: Filter out "private" directories starting with underscore
+        directories = [x for x in directories if not _is_private_directory(x)]
+
         if len(files) > 0 and len(directories) > 0:
             raise ValueError('Found files in an intermediate '
                              'directory: {0}'.format(base_path))
@@ -456,6 +460,11 @@ def _parse_hive_partition(value):
     return value.split('=', 1)
 
 
+def _is_private_directory(x):
+    _, tail = os.path.split(x)
+    return tail.startswith('_') and '=' not in tail
+
+
 def _path_split(path, sep):
     i = path.rfind(sep) + 1
     head, tail = path[:i], path[i:]

http://git-wip-us.apache.org/repos/asf/arrow/blob/a1c8b83b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f606a7f..0f44d16 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -36,9 +36,14 @@ import pandas.util.testing as tm
 parquet = pytest.mark.parquet
 
 
-def _write_table(*args, **kwargs):
+def _write_table(table, path, **kwargs):
     import pyarrow.parquet as pq
-    return pq.write_table(*args, **kwargs)
+
+    if isinstance(table, pd.DataFrame):
+        table = pa.Table.from_pandas(table)
+
+    pq.write_table(table, path, **kwargs)
+    return table
 
 
 def _read_table(*args, **kwargs):
@@ -852,6 +857,32 @@ def test_read_multiple_files(tmpdir):
 
 
 @parquet
+def test_ignore_private_directories(tmpdir):
+    import pyarrow.parquet as pq
+
+    nfiles = 10
+    size = 5
+
+    dirpath = tmpdir.join(guid()).strpath
+    os.mkdir(dirpath)
+
+    test_data = []
+    paths = []
+    for i in range(nfiles):
+        df = _test_dataframe(size, seed=i)
+        path = pjoin(dirpath, '{0}.parquet'.format(i))
+
+        test_data.append(_write_table(df, path))
+        paths.append(path)
+
+    # private directory
+    os.mkdir(pjoin(dirpath, '_impala_staging'))
+
+    dataset = pq.ParquetDataset(dirpath)
+    assert set(paths) == set(x.path for x in dataset.pieces)
+
+
+@parquet
 def test_multiindex_duplicate_values(tmpdir):
     num_rows = 3
     numbers = list(range(num_rows))
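A note on the predicate's semantics, since dataset directories may also contain
Hive-style partition directories: a directory is treated as private only when
its final path component starts with an underscore and contains no '=', so
partition directories such as 'year=2017' (or even one named '_year=2017') are
still traversed. A quick sketch using the same logic as the helper above; the
example paths here are invented.

    import os

    def _is_private_directory(x):
        # Same logic as the helper added to python/pyarrow/parquet.py above
        _, tail = os.path.split(x)
        return tail.startswith('_') and '=' not in tail

    print(_is_private_directory('/data/ds/_impala_staging'))  # True  -> skipped
    print(_is_private_directory('/data/ds/year=2017'))        # False -> traversed
    print(_is_private_directory('/data/ds/_year=2017'))       # False -> traversed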