kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a...@apache.org
Subject [1/2] kudu git commit: KUDU-2052: use XFS_IOC_UNRESVSP64 ioctl to punch holes on xfs
Date Sat, 24 Jun 2017 02:52:55 GMT
Repository: kudu
Updated Branches:
  refs/heads/master 410763afa -> c96d6680b


KUDU-2052: use XFS_IOC_UNRESVSP64 ioctl to punch holes on xfs

There's not much to say beyond what's already in the bug report.

To test, I created a 128 GB loopback-mounted xfs filesystem and ran the new
benchmark on it. I included the results below. I also ran the entire Kudu
test suite on that filesystem.

  $ TEST_TMPDIR=/data/8/adar/xfs/ bin/env-test \
    --gtest_filter=*Benchmark* \
    --never_fsync=false \
    --unlock_unsafe_flags
  Time spent writing 1073741824 bytes to file: real 0.538s	user 0.006s	sys 0.533s
  Time spent syncing file: real 8.142s	user 0.002s	sys 0.193s
  Time spent punching first hole of size 10485760: real 0.131s	user 0.000s	sys 0.130s
  Time spent syncing file: real 0.000s	user 0.000s	sys 0.000s
  Time spent repunching 1000 holes of size 10485760: real 0.007s	user 0.004s	sys 0.003s

  $ TEST_TMPDIR=/data/8/adar/xfs/ bin/env-test \
    --gtest_filter=*Benchmark* \
    --never_fsync=false \
    --unlock_unsafe_flags \
    --unlock_experimental_flags \
    --env_use_ioctl_hole_punch_on_xfs=false
  Time spent writing 1073741824 bytes to file: real 0.555s	user 0.007s	sys 0.550s
  Time spent syncing file: real 8.241s	user 0.000s	sys 0.219s
  Time spent punching first hole of size 10485760: real 0.175s	user 0.000s	sys 0.148s
  Time spent syncing file: real 0.000s	user 0.000s	sys 0.000s
  Time spent repunching 1000 holes of size 10485760: real 27.719s	user 0.026s	sys 0.029s

Change-Id: Ia3f5478729594f2a7e4e441905995b6f5d207d51
Reviewed-on: http://gerrit.cloudera.org:8080/7269
Reviewed-by: Todd Lipcon <todd@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/ca8ed1d4
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/ca8ed1d4
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/ca8ed1d4

Branch: refs/heads/master
Commit: ca8ed1d48beaf492231c0a69d030bf22b473d007
Parents: 410763a
Author: Adar Dembo <adar@cloudera.com>
Authored: Thu Jun 22 16:41:10 2017 -0700
Committer: Adar Dembo <adar@cloudera.com>
Committed: Sat Jun 24 02:52:32 2017 +0000

----------------------------------------------------------------------
 src/kudu/util/env-test.cc  | 48 +++++++++++++++++++++
 src/kudu/util/env_posix.cc | 95 +++++++++++++++++++++++++++++++++--------
 2 files changed, 126 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/ca8ed1d4/src/kudu/util/env-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/env-test.cc b/src/kudu/util/env-test.cc
index bb8095c..f5bff38 100644
--- a/src/kudu/util/env-test.cc
+++ b/src/kudu/util/env-test.cc
@@ -33,6 +33,8 @@
 #include "kudu/util/env_util.h"
 #include "kudu/util/malloc.h"
 #include "kudu/util/path_util.h"
+#include "kudu/util/random.h"
+#include "kudu/util/random_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/stopwatch.h"
 #include "kudu/util/test_util.h"
@@ -347,6 +349,52 @@ TEST_F(TestEnv, TestHolePunch) {
   ASSERT_EQ(size_on_disk - punch_amount, new_size_on_disk);
 }
 
+TEST_F(TestEnv, TestHolePunchBenchmark) {
+  const int kFileSize = 1 * 1024 * 1024 * 1024;
+  const int kHoleSize = 10 * kOneMb;
+  const int kNumRuns = 1000;
+  if (!fallocate_punch_hole_supported_) {
+    LOG(INFO) << "hole punching not supported, skipping test";
+    return;
+  }
+  Random r(SeedRandom());
+
+  string test_path = GetTestPath("test");
+  unique_ptr<RWFile> file;
+  ASSERT_OK(env_->NewRWFile(test_path, &file));
+
+  // Initialize a scratch buffer with random data.
+  uint8_t scratch[kOneMb];
+  RandomString(&scratch, kOneMb, &r);
+
+  // Fill the file with sequences of the random data.
+  LOG_TIMING(INFO, Substitute("writing $0 bytes to file", kFileSize)) {
+    Slice slice(scratch, kOneMb);
+    for (int i = 0; i < kFileSize; i += kOneMb) {
+      ASSERT_OK(file->Write(i, slice));
+    }
+  }
+  LOG_TIMING(INFO, "syncing file") {
+    ASSERT_OK(file->Sync());
+  }
+
+  // Punch the first hole.
+  LOG_TIMING(INFO, Substitute("punching first hole of size $0", kHoleSize)) {
+    ASSERT_OK(file->PunchHole(0, kHoleSize));
+  }
+  LOG_TIMING(INFO, "syncing file") {
+    ASSERT_OK(file->Sync());
+  }
+
+  // Run the benchmark.
+  LOG_TIMING(INFO, Substitute("repunching $0 holes of size $1",
+                              kNumRuns, kHoleSize)) {
+    for (int i = 0; i < kNumRuns; i++) {
+      ASSERT_OK(file->PunchHole(0, kHoleSize));
+    }
+  }
+}
+
 TEST_F(TestEnv, TestTruncate) {
   LOG(INFO) << "Testing Truncate()";
   string test_path = GetTestPath("test_env_wf");

http://git-wip-us.apache.org/repos/asf/kudu/blob/ca8ed1d4/src/kudu/util/env_posix.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/env_posix.cc b/src/kudu/util/env_posix.cc
index 9f18de7..588b4b3 100644
--- a/src/kudu/util/env_posix.cc
+++ b/src/kudu/util/env_posix.cc
@@ -36,6 +36,7 @@
 #include "kudu/gutil/bind.h"
 #include "kudu/gutil/callback.h"
 #include "kudu/gutil/map-util.h"
+#include "kudu/gutil/once.h"
 #include "kudu/gutil/strings/split.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/atomic.h"
@@ -77,6 +78,21 @@
 #define FALLOC_FL_PUNCH_HOLE  0x02 /* de-allocates range */
 #endif
 
+
+#ifndef __APPLE__
+// These struct and ioctl definitions were copied verbatim from xfsprogs.
+typedef struct xfs_flock64 {
+        __s16           l_type;
+        __s16           l_whence;
+        __s64           l_start;
+        __s64           l_len;          /* len == 0 means until end of file */
+        __s32           l_sysid;
+        __u32           l_pid;
+        __s32           l_pad[4];       /* reserve area                     */
+} xfs_flock64_t;
+#define XFS_IOC_UNRESVSP64      _IOW ('X', 43, struct xfs_flock64)
+#endif
+
 // For platforms without fdatasync (like OS X)
 #ifndef fdatasync
 #define fdatasync fsync
@@ -126,6 +142,13 @@ DEFINE_bool(env_use_fsync, false,
 TAG_FLAG(env_use_fsync, advanced);
 TAG_FLAG(env_use_fsync, evolving);
 
+// See KUDU-2052 for details.
+DEFINE_bool(env_use_ioctl_hole_punch_on_xfs, true,
+            "Use the XFS_IOC_UNRESVSP64 ioctl instead of fallocate(2) to "
+            "punch holes on XFS filesystems.");
+TAG_FLAG(env_use_ioctl_hole_punch_on_xfs, advanced);
+TAG_FLAG(env_use_ioctl_hole_punch_on_xfs, experimental);
+
 DEFINE_bool(suicide_on_eio, true,
             "Kill the process if an I/O operation results in EIO. If false, "
             "I/O resulting in EIOs will return the status IOError and leave "
@@ -460,6 +483,23 @@ Status DoWriteV(int fd, const string& filename, uint64_t offset,
   return Status::OK();
 }
 
+Status DoIsOnXfsFilesystem(const string& path, bool* result) {
+#ifdef __APPLE__
+  *result = false;
+#else
+  struct statfs buf;
+  int ret;
+  RETRY_ON_EINTR(ret, statfs(path.c_str(), &buf));
+  if (ret == -1) {
+    return IOError(Substitute("statfs: $0", path), errno);
+  }
+  // This magic number isn't defined in any header but is the value of the
+  // US-ASCII string 'XFSB' expressed in hexadecimal.
+  *result = (buf.f_type == 0x58465342);
+#endif
+  return Status::OK();
+}
+
 class PosixSequentialFile: public SequentialFile {
  private:
   std::string filename_;
@@ -691,6 +731,7 @@ class PosixRWFile : public RWFile {
         fd_(fd),
         sync_on_close_(sync_on_close),
         pending_sync_(false),
+        is_on_xfs_(false),
         closed_(false) {}
 
   ~PosixRWFile() {
@@ -759,7 +800,27 @@ class PosixRWFile : public RWFile {
     TRACE_EVENT1("io", "PosixRWFile::PunchHole", "path", filename_);
     MAYBE_RETURN_EIO(filename_, IOError(Env::kInjectedFailureStatusMsg, EIO));
     ThreadRestrictions::AssertIOAllowed();
-    if (fallocate(fd_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length) < 0)
{
+
+    // KUDU-2052: xfs in el6 systems induces an fsync in the kernel whenever it
+    // performs a hole punch through the fallocate() syscall, even if the file
+    // range was already punched out. The older xfs-specific hole punching
+    // ioctl doesn't do this, despite eventually executing the same xfs code.
+    // To keep the code simple, we'll use this ioctl on any xfs system (not
+    // just on el6) and fallocate() on all other filesystems.
+    //
+    // Note: the cast to void* here (and back to PosixRWFile*, in InitIsOnXFS)
+    // is needed to avoid an undefined behavior warning from UBSAN.
+    once_.Init(&InitIsOnXFS, reinterpret_cast<void*>(this));
+    if (is_on_xfs_ && FLAGS_env_use_ioctl_hole_punch_on_xfs) {
+      xfs_flock64_t cmd;
+      memset(&cmd, 0, sizeof(cmd));
+      cmd.l_start = offset;
+      cmd.l_len = length;
+      if (ioctl(fd_, XFS_IOC_UNRESVSP64, &cmd) < 0) {
+        return IOError(filename_, errno);
+      }
+    } else if (fallocate(fd_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                         offset, length) < 0) {
       return IOError(filename_, errno);
     }
     return Status::OK();
@@ -902,11 +963,26 @@ class PosixRWFile : public RWFile {
   }
 
  private:
+  static void InitIsOnXFS(void* arg) {
+    PosixRWFile* rwf = reinterpret_cast<PosixRWFile*>(arg);
+    bool result;
+    Status s = DoIsOnXfsFilesystem(rwf->filename_, &result);
+    if (s.ok()) {
+      rwf->is_on_xfs_ = result;
+    } else {
+      KLOG_EVERY_N_SECS(WARNING, 1) <<
+          Substitute("Could not determine whether file is on xfs, assuming not: $0",
+                     s.ToString());
+    }
+  }
+
   const std::string filename_;
   const int fd_;
   const bool sync_on_close_;
 
+  GoogleOnceDynamic once_;
   AtomicBool pending_sync_;
+  bool is_on_xfs_;
   bool closed_;
 };
 
@@ -1535,22 +1611,7 @@ class PosixEnv : public Env {
     TRACE_EVENT1("io", "PosixEnv::IsOnXfsFilesystem", "path", path);
     MAYBE_RETURN_EIO(path, IOError(Env::kInjectedFailureStatusMsg, EIO));
     ThreadRestrictions::AssertIOAllowed();
-
-#ifdef __APPLE__
-    *result = false;
-#else
-    struct statfs buf;
-    int ret;
-    RETRY_ON_EINTR(ret, statfs(path.c_str(), &buf));
-    if (ret == -1) {
-      return IOError(Substitute("statfs: $0", path), errno);
-    }
-
-    // This magic number isn't defined in any header but is the value of the
-    // US-ASCII string 'XFSB' expressed in hexadecimal.
-    *result = (buf.f_type == 0x58465342);
-#endif
-    return Status::OK();
+    return DoIsOnXfsFilesystem(path, result);
   }
 
   virtual string GetKernelRelease() OVERRIDE {


Mime
View raw message