arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject arrow git commit: ARROW-545: [Python] Ignore non .parq/.parquet files when reading directories as Parquet datasets
Date Thu, 09 Feb 2017 12:47:17 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 0bdfd5efb -> 31f145dc5


ARROW-545: [Python] Ignore non .parq/.parquet files when reading directories as Parquet datasets

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #331 from wesm/ARROW-545 and squashes the following commits:

5494167 [Wes McKinney] Docstring typo
92b274c [Wes McKinney] Ignore non .parq/.parquet files when reading directories-as-Parquet-datasets


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/31f145dc
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/31f145dc
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/31f145dc

Branch: refs/heads/master
Commit: 31f145dc5296d27cc8010a4cd17ca5b4ae461dff
Parents: 0bdfd5e
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Thu Feb 9 13:47:09 2017 +0100
Committer: Uwe L. Korn <uwelk@xhochy.com>
Committed: Thu Feb 9 13:47:09 2017 +0100

----------------------------------------------------------------------
 python/pyarrow/__init__.py           |  2 +-
 python/pyarrow/filesystem.py         | 23 +++++++++++++++++------
 python/pyarrow/parquet.py            | 18 ++++++++++++++++--
 python/pyarrow/tests/test_parquet.py |  4 ++++
 4 files changed, 38 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index ea4710d..6724b52 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -67,4 +67,4 @@ from pyarrow.schema import (null, bool_,
 from pyarrow.table import Column, RecordBatch, Table, concat_tables
 
 
-localfs = LocalFilesystem()
+localfs = LocalFilesystem.get_instance()

http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/filesystem.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 82409b7..55bcad0 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -62,7 +62,7 @@ class Filesystem(object):
         """
         raise NotImplementedError
 
-    def read_parquet(self, path, columns=None, schema=None):
+    def read_parquet(self, path, columns=None, metadata=None, schema=None):
         """
         Read Parquet data from path in file system. Can read from a single file
         or a directory of files
@@ -73,8 +73,11 @@ class Filesystem(object):
             Single file path or directory
         columns : List[str], optional
             Subset of columns to read
+        metadata : pyarrow.parquet.FileMetaData
+            Known metadata to validate files against
         schema : pyarrow.parquet.Schema
-            Known schema to validate files against
+            Known schema to validate files against. Alternative to metadata
+            argument
 
         Returns
         -------
@@ -85,18 +88,26 @@ class Filesystem(object):
         if self.isdir(path):
             paths_to_read = []
             for path in self.ls(path):
-                if path == '_metadata' or path == '_common_metadata':
-                    raise ValueError('No support yet for common metadata file')
-                paths_to_read.append(path)
+                if path.endswith('parq') or path.endswith('parquet'):
+                    paths_to_read.append(path)
         else:
             paths_to_read = [path]
 
         return read_multiple_files(paths_to_read, columns=columns,
-                                   filesystem=self, schema=schema)
+                                   filesystem=self, schema=schema,
+                                   metadata=metadata)
 
 
 class LocalFilesystem(Filesystem):
 
+    _instance = None
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            cls._instance = LocalFilesystem()
+        return cls._instance
+
     @implements(Filesystem.ls)
     def ls(self, path):
         return sorted(pjoin(path, x) for x in os.listdir(path))

http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 6654b77..9766ff6 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,12 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import six
+
 from pyarrow._parquet import (ParquetReader, FileMetaData,  # noqa
                               RowGroupMetaData, Schema, ParquetWriter)
 import pyarrow._parquet as _parquet  # noqa
 from pyarrow.table import concat_tables
 
 
+EXCLUDED_PARQUET_PATHS = {'_metadata', '_common_metadata', '_SUCCESS'}
+
+
 class ParquetFile(object):
     """
     Open a Parquet binary file for reading
@@ -82,8 +87,9 @@ def read_table(source, columns=None, nthreads=1, metadata=None):
     Parameters
     ----------
     source: str or pyarrow.io.NativeFile
-        Readable source. For passing Python file objects or byte buffers, see
-        pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
+        Location of Parquet dataset. If a string is passed, it can be a single
+        file name or directory name. For passing Python file objects or byte
+        buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
     columns: list
         If not None, only these columns will be read from the file.
     nthreads : int, default 1
@@ -97,6 +103,14 @@ def read_table(source, columns=None, nthreads=1, metadata=None):
     pyarrow.Table
         Content of the file as a table (of columns)
     """
+    from pyarrow.filesystem import LocalFilesystem
+
+    if isinstance(source, six.string_types):
+        fs = LocalFilesystem.get_instance()
+        if fs.isdir(source):
+            return fs.read_parquet(source, columns=columns,
+                                   metadata=metadata)
+
     pf = ParquetFile(source, metadata=metadata)
     return pf.read(columns=columns, nthreads=nthreads)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 80a995f..969f68b 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -393,6 +393,10 @@ def test_read_multiple_files(tmpdir):
         test_data.append(table)
         paths.append(path)
 
+    # Write a _SUCCESS.crc file
+    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
+        f.write(b'0')
+
     result = pq.read_multiple_files(paths)
     expected = pa.concat_tables(test_data)
 


Mime
View raw message