arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [arrow] branch master updated: ARROW-1830: [Python] Relax restriction that Parquet files in a dataset end in .parq or .parquet
Date Tue, 21 Nov 2017 13:35:28 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d887d91  ARROW-1830: [Python] Relax restriction that Parquet files in a dataset end in .parq or .parquet
d887d91 is described below

commit d887d9122a17530126a40a60944e84207d40d412
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Tue Nov 21 14:35:22 2017 +0100

    ARROW-1830: [Python] Relax restriction that Parquet files in a dataset end in .parq or .parquet
    
    It seems that some setups may not use these file extensions, and so our assumption is too rigid.
    
    cc @dbtsai
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1340 from wesm/ARROW-1830 and squashes the following commits:
    
    d123ae1 [Wes McKinney] Relax restriction that Parquet files in a dataset end in .parq or .parquet
---
 python/pyarrow/parquet.py            | 12 ++++--------
 python/pyarrow/tests/test_parquet.py |  2 +-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 3023e17..37da662 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -421,10 +421,6 @@ class ParquetDatasetPiece(object):
         return table
 
 
-def _is_parquet_file(path):
-    return path.endswith('parq') or path.endswith('parquet')
-
-
 class PartitionSet(object):
     """A data structure for cataloguing the observed Parquet partitions at a
     particular level. So if we have
@@ -556,14 +552,14 @@ class ParquetManifest(object):
         filtered_files = []
         for path in files:
             full_path = self.pathsep.join((base_path, path))
-            if _is_parquet_file(path):
-                filtered_files.append(full_path)
-            elif path.endswith('_common_metadata'):
+            if path.endswith('_common_metadata'):
                 self.common_metadata_path = full_path
             elif path.endswith('_metadata'):
                 self.metadata_path = full_path
-            elif not self._should_silently_exclude(path):
+            elif self._should_silently_exclude(path):
                 print('Ignoring path: {0}'.format(full_path))
+            else:
+                filtered_files.append(full_path)
 
         # ARROW-1079: Filter out "private" directories starting with underscore
         filtered_directories = [self.pathsep.join((base_path, x))
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 522815f..274ff45 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1020,7 +1020,7 @@ def _generate_partition_directories(fs, base_dir, partition_spec, df):
 
             if level == DEPTH - 1:
                 # Generate example data
-                file_path = pjoin(level_dir, 'data.parq')
+                file_path = pjoin(level_dir, guid())
 
                 filtered_df = _filter_partition(df, this_part_keys)
                 part_table = pa.Table.from_pandas(filtered_df)

-- 
To stop receiving notification emails like this one, please contact
commits@arrow.apache.org.

Mime
View raw message