arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-1756: [Python] Fix large file read/write error
Date Sat, 04 Nov 2017 20:56:35 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 62190d7  ARROW-1756: [Python] Fix large file read/write error
62190d7 is described below

commit 62190d7a5201cad6ae0b26d790942ffc8861eee9
Author: Licht-T <licht-t@outlook.jp>
AuthorDate: Sat Nov 4 16:56:27 2017 -0400

    ARROW-1756: [Python] Fix large file read/write error
    
    This is the part of [ARROW-1756](https://issues.apache.org/jira/browse/ARROW-1756).
    
    Author: Licht-T <licht-t@outlook.jp>
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1276 from Licht-T/fix-large-file-read-write-error and squashes the following commits:
    
    e21964a3 [Wes McKinney] Break in read IO loop when reaching EOF
    8a68756b [Wes McKinney] Minor code tweaks, fix clang documentation warnings
    81c19721 [Licht-T] TST: Add test for the large file read/write
    9b71afee [Licht-T] ENH: Convert errno to string error message
    fbb7eea6 [Licht-T] BUG: Fix large file read/write error
---
 cpp/src/arrow/buffer.h               |  2 +-
 cpp/src/arrow/compare.h              | 10 +++---
 cpp/src/arrow/io/file.cc             | 66 ++++++++++++++++++++++++++++++------
 python/pyarrow/tests/conftest.py     | 14 +++++++-
 python/pyarrow/tests/test_feather.py |  9 +++--
 5 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 8e98906..7c5f617 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -340,7 +340,7 @@ Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size,
 #ifndef ARROW_NO_DEPRECATED_API
 
 /// \brief Create Buffer referencing std::string memory
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 ///
 /// Warning: string instance must stay alive
 ///
diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h
index 27176ed..df3386e 100644
--- a/cpp/src/arrow/compare.h
+++ b/cpp/src/arrow/compare.h
@@ -33,27 +33,27 @@ class Tensor;
 
 #ifndef ARROW_NO_DEPRECATED_API
 /// Returns true if the arrays are exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool* are_equal);
 
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
                                  bool* are_equal);
 
 /// Returns true if the arrays are approximately equal. For non-floating point
 /// types, this is equivalent to ArrayEquals(left, right)
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
                                       bool* are_equal);
 
 /// Returns true if indicated equal-length segment of arrays is exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
                                      int64_t start_idx, int64_t end_idx,
                                      int64_t other_start_idx, bool* are_equal);
 
 /// Returns true if the type metadata are exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
                                bool* are_equal);
 #endif
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 74c6c09..057cad1 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -22,6 +22,21 @@
 
 #define _FILE_OFFSET_BITS 64
 
+// define max read/write count
+#if defined(_MSC_VER)
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+
+#ifdef __APPLE__
+// due to macOS bug, we need to set read/write max
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+// see notes on Linux read/write manpage
+#define ARROW_MAX_IO_CHUNKSIZE 0x7ffff000
+#endif
+
+#endif
+
 #include "arrow/io/file.h"
 
 #if _WIN32 || _WIN64
@@ -238,39 +253,68 @@ static inline Status FileSeek(int fd, int64_t pos) {
   return Status::OK();
 }
 
-static inline Status FileRead(int fd, uint8_t* buffer, int64_t nbytes,
+static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbytes,
                               int64_t* bytes_read) {
 #if defined(_MSC_VER)
-  if (nbytes > INT32_MAX) {
+  if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
     return Status::IOError("Unable to read > 2GB blocks yet");
   }
   *bytes_read = static_cast<int64_t>(_read(fd, buffer, static_cast<uint32_t>(nbytes)));
 #else
-  *bytes_read = static_cast<int64_t>(read(fd, buffer, static_cast<size_t>(nbytes)));
+  *bytes_read = 0;
+
+  while (*bytes_read != -1 && *bytes_read < nbytes) {
+    int64_t chunksize =
+        std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - *bytes_read);
+    int64_t ret = static_cast<int64_t>(
+        read(fd, buffer + *bytes_read, static_cast<size_t>(chunksize)));
+
+    if (ret != -1) {
+      *bytes_read += ret;
+      if (ret < chunksize) {
+        // EOF
+        break;
+      }
+    } else {
+      *bytes_read = ret;
+    }
+  }
 #endif
 
   if (*bytes_read == -1) {
-    // TODO(wesm): errno to string
-    return Status::IOError("Error reading bytes from file");
+    return Status::IOError(std::string("Error reading bytes from file: ") +
+                           std::string(strerror(errno)));
   }
 
   return Status::OK();
 }
 
-static inline Status FileWrite(int fd, const uint8_t* buffer, int64_t nbytes) {
-  int ret;
+static inline Status FileWrite(const int fd, const uint8_t* buffer,
+                               const int64_t nbytes) {
+  int ret = 0;
 #if defined(_MSC_VER)
-  if (nbytes > INT32_MAX) {
+  if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
     return Status::IOError("Unable to write > 2GB blocks to file yet");
   }
   ret = static_cast<int>(_write(fd, buffer, static_cast<uint32_t>(nbytes)));
 #else
-  ret = static_cast<int>(write(fd, buffer, static_cast<size_t>(nbytes)));
+  int64_t bytes_written = 0;
+
+  while (ret != -1 && bytes_written < nbytes) {
+    int64_t chunksize =
+        std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_written);
+    ret = static_cast<int>(
+        write(fd, buffer + bytes_written, static_cast<size_t>(chunksize)));
+
+    if (ret != -1) {
+      bytes_written += ret;
+    }
+  }
 #endif
 
   if (ret == -1) {
-    // TODO(wesm): errno to string
-    return Status::IOError("Error writing bytes to file");
+    return Status::IOError(std::string("Error writing bytes from file: ") +
+                           std::string(strerror(errno)));
   }
   return Status::OK();
 }
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index c6bd6c9..e276822 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pytest import skip
+from pytest import skip, mark
 
 
 groups = [
@@ -70,6 +70,18 @@ def pytest_addoption(parser):
                          default=False,
                          help=('Run only the {0} test group'.format(group)))
 
+    parser.addoption('--runslow', action='store_true',
+                     default=False, help='run slow tests')
+
+
+def pytest_collection_modifyitems(config, items):
+    if not config.getoption('--runslow'):
+        skip_slow = mark.skip(reason='need --runslow option to run')
+
+        for item in items:
+            if 'slow' in item.keywords:
+                item.add_marker(skip_slow)
+
 
 def pytest_runtest_setup(item):
     only_set = False
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 9e7fc88..b0764fd 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -50,7 +50,7 @@ class TestFeatherReader(unittest.TestCase):
                 pass
 
     def test_file_not_exist(self):
-        with self.assertRaises(pa.ArrowIOError):
+        with pytest.raises(pa.ArrowIOError):
             FeatherReader('test_invalid_file')
 
     def _get_null_counts(self, path, columns=None):
@@ -98,7 +98,7 @@ class TestFeatherReader(unittest.TestCase):
         def f():
             write_feather(df, path)
 
-        self.assertRaises(exc, f)
+        pytest.raises(exc, f)
 
     def test_num_rows_attr(self):
         df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
@@ -466,3 +466,8 @@ class TestFeatherReader(unittest.TestCase):
         # non-strings
         df = pd.DataFrame({'a': ['a', 1, 2.0]})
         self._assert_error_on_write(df, ValueError)
+
+    @pytest.mark.slow
+    def test_large_dataframe(self):
+        df = pd.DataFrame({'A': np.arange(400000000)})
+        self._check_pandas_roundtrip(df)

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Mime
View raw message