Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id D2A46200C74 for ; Sun, 14 May 2017 17:23:32 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id C9771160BA9; Sun, 14 May 2017 15:23:32 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 1B147160BA6 for ; Sun, 14 May 2017 17:23:31 +0200 (CEST) Received: (qmail 68585 invoked by uid 500); 14 May 2017 15:23:31 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 68576 invoked by uid 99); 14 May 2017 15:23:31 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 14 May 2017 15:23:31 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id F40F7DFB91; Sun, 14 May 2017 15:23:30 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: uwe@apache.org To: commits@arrow.apache.org Message-Id: <5538fc0d827e46028e4707a57c5162b4@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: arrow git commit: ARROW-1022: [Python] Add multithreaded read option to read_feather Date: Sun, 14 May 2017 15:23:30 +0000 (UTC) archived-at: Sun, 14 May 2017 15:23:33 -0000 Repository: arrow Updated Branches: refs/heads/master 5739e04b3 -> d8d3d8435 ARROW-1022: [Python] Add multithreaded read option to read_feather Author: Wes McKinney Closes #682 from wesm/ARROW-1022 and squashes the following commits: 8fd241e [Wes McKinney] Add multithreaded read option to read_feather Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d8d3d843 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d8d3d843 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d8d3d843 Branch: refs/heads/master Commit: d8d3d84354d827e45c8267cd05aecd2aa36cf60b Parents: 5739e04 Author: Wes McKinney Authored: Sun May 14 17:23:26 2017 +0200 Committer: Uwe L. Korn Committed: Sun May 14 17:23:26 2017 +0200 ---------------------------------------------------------------------- python/pyarrow/feather.py | 10 ++++++---- python/pyarrow/tests/test_feather.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/feather.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 3754aec..34783a7 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -37,7 +37,7 @@ class FeatherReader(ext.FeatherReader): self.source = source self.open(source) - def read(self, columns=None): + def read(self, columns=None, nthreads=1): if columns is not None: column_set = set(columns) else: @@ -53,7 +53,7 @@ class FeatherReader(ext.FeatherReader): names.append(name) table = Table.from_arrays(columns, names=names) - return table.to_pandas() + return table.to_pandas(nthreads=nthreads) class FeatherWriter(object): @@ -118,7 +118,7 @@ def write_feather(df, dest): raise -def read_feather(source, columns=None): +def read_feather(source, columns=None, nthreads=1): """ Read a pandas.DataFrame from Feather format @@ -128,10 +128,12 @@ def read_feather(source, columns=None): columns : sequence, optional Only read a specific set of columns. If not provided, all columns are read + nthreads : int, default 1 + Number of CPU threads to use when reading to pandas.DataFrame Returns ------- df : pandas.DataFrame """ reader = FeatherReader(source) - return reader.read(columns=columns) + return reader.read(columns=columns, nthreads=nthreads) http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/tests/test_feather.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 69c32be..287e0da 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -61,7 +61,8 @@ class TestFeatherReader(unittest.TestCase): return counts def _check_pandas_roundtrip(self, df, expected=None, path=None, - columns=None, null_counts=None): + columns=None, null_counts=None, + nthreads=1): if path is None: path = random_path() @@ -70,7 +71,7 @@ class TestFeatherReader(unittest.TestCase): if not os.path.exists(path): raise Exception('file not written') - result = read_feather(path, columns) + result = read_feather(path, columns, nthreads=nthreads) if expected is None: expected = df @@ -293,6 +294,12 @@ class TestFeatherReader(unittest.TestCase): df = pd.DataFrame({'strings': [''] * 10}) self._check_pandas_roundtrip(df) + def test_multithreaded_read(self): + data = {'c{0}'.format(i): [''] * 10 + for i in range(100)} + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df, nthreads=4) + def test_nan_as_null(self): # Create a nan that is not numpy.nan values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)