kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mpe...@apache.org
Subject [1/2] kudu git commit: disk failure: randomized tserver test
Date Wed, 29 Nov 2017 06:23:24 GMT
Repository: kudu
Updated Branches:
  refs/heads/master 27da8323d -> d74b98fbb


disk failure: randomized tserver test

This patch adds a test that exercises various paths that perform block IO by
manually applying randomized operations to a tablet and injecting disk
failures a fraction of the time. Eventually, the replica is marked as
failed, completing the test. This patch doesn't test the behavior after
reaching this failed state, but rather ensures this state is reachable
without crashing the server.

Change-Id: I17aca35fc0bf9c7b1b73308def865b0ed48d07fa
Reviewed-on: http://gerrit.cloudera.org:8080/8387
Reviewed-by: Mike Percy <mpercy@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a18710cb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a18710cb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a18710cb

Branch: refs/heads/master
Commit: a18710cba8cd44f6bc759ebf93e6c6bbca25be53
Parents: 27da832
Author: Andrew Wong <awong@cloudera.com>
Authored: Wed Nov 22 11:44:05 2017 -0800
Committer: Andrew Wong <awong@cloudera.com>
Committed: Wed Nov 29 04:07:34 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/cluster_itest_util.cc     |   3 +-
 src/kudu/integration-tests/cluster_itest_util.h |  17 +--
 src/kudu/tserver/tablet_server-test-base.h      |  14 +--
 src/kudu/tserver/tablet_server-test.cc          | 117 ++++++++++++++++++-
 src/kudu/tserver/tablet_server_test_util.cc     |  10 +-
 src/kudu/tserver/tablet_server_test_util.h      |  17 +--
 6 files changed, 142 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/integration-tests/cluster_itest_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc
index ad3bd8c..ddbe798 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -94,6 +94,7 @@ using rpc::RpcController;
 using std::min;
 using std::shared_ptr;
 using std::string;
+using std::unique_ptr;
 using std::unordered_map;
 using std::vector;
 using strings::Substitute;
@@ -298,7 +299,7 @@ Status CreateTabletServerMap(const shared_ptr<MasterServiceProxy>& master_proxy,
     vector<Sockaddr> addresses;
     host_port.ResolveAddresses(&addresses);
 
-    gscoped_ptr<TServerDetails> peer(new TServerDetails);
+    unique_ptr<TServerDetails> peer(new TServerDetails);
     peer->instance_id.CopyFrom(entry.instance_id());
     peer->registration.CopyFrom(entry.registration());
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/integration-tests/cluster_itest_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h
index 23481a5..e9e13cc 100644
--- a/src/kudu/integration-tests/cluster_itest_util.h
+++ b/src/kudu/integration-tests/cluster_itest_util.h
@@ -23,9 +23,7 @@
 // cluster type if it's general enough to use from multiple tests while not
 // belonging in the InternalMiniCluster / ExternalMiniCluster classes themselves. But
 // consider just putting stuff like that in those classes.
-
-#ifndef KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_
-#define KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_
+#pragma once
 
 #include <cstdint>
 #include <memory>
@@ -39,7 +37,6 @@
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus.proxy.h"
 #include "kudu/consensus/metadata.pb.h"
-#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/server/server_base.proxy.h"
 #include "kudu/tablet/metadata.pb.h"
 #include "kudu/tserver/tablet_copy.pb.h"
@@ -79,11 +76,11 @@ namespace itest {
 struct TServerDetails {
   NodeInstancePB instance_id;
   ServerRegistrationPB registration;
-  gscoped_ptr<tserver::TabletCopyServiceProxy> tablet_copy_proxy;
-  gscoped_ptr<tserver::TabletServerServiceProxy> tserver_proxy;
-  gscoped_ptr<tserver::TabletServerAdminServiceProxy> tserver_admin_proxy;
-  gscoped_ptr<consensus::ConsensusServiceProxy> consensus_proxy;
-  gscoped_ptr<server::GenericServiceProxy> generic_proxy;
+  std::unique_ptr<tserver::TabletCopyServiceProxy> tablet_copy_proxy;
+  std::unique_ptr<tserver::TabletServerServiceProxy> tserver_proxy;
+  std::unique_ptr<tserver::TabletServerAdminServiceProxy> tserver_admin_proxy;
+  std::unique_ptr<consensus::ConsensusServiceProxy> consensus_proxy;
+  std::unique_ptr<server::GenericServiceProxy> generic_proxy;
 
   // Convenience function to get the UUID from the instance_id struct.
   const std::string& uuid() const;
@@ -418,5 +415,3 @@ Status GetInt64Metric(const HostPort& http_hp,
 
 } // namespace itest
 } // namespace kudu
-
-#endif // KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_

http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/tserver/tablet_server-test-base.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server-test-base.h b/src/kudu/tserver/tablet_server-test-base.h
index a34c9b5..8aa793e 100644
--- a/src/kudu/tserver/tablet_server-test-base.h
+++ b/src/kudu/tserver/tablet_server-test-base.h
@@ -125,17 +125,17 @@ class TabletServerTestBase : public KuduTest {
 
   const Schema schema_;
   Schema key_schema_;
-  gscoped_ptr<RowBuilder> rb_;
+  std::unique_ptr<RowBuilder> rb_;
 
   std::shared_ptr<rpc::Messenger> client_messenger_;
 
-  gscoped_ptr<MiniTabletServer> mini_server_;
+  std::unique_ptr<MiniTabletServer> mini_server_;
   scoped_refptr<tablet::TabletReplica> tablet_replica_;
-  gscoped_ptr<TabletCopyServiceProxy> tablet_copy_proxy_;
-  gscoped_ptr<TabletServerServiceProxy> proxy_;
-  gscoped_ptr<TabletServerAdminServiceProxy> admin_proxy_;
-  gscoped_ptr<consensus::ConsensusServiceProxy> consensus_proxy_;
-  gscoped_ptr<server::GenericServiceProxy> generic_proxy_;
+  std::unique_ptr<TabletCopyServiceProxy> tablet_copy_proxy_;
+  std::unique_ptr<TabletServerServiceProxy> proxy_;
+  std::unique_ptr<TabletServerAdminServiceProxy> admin_proxy_;
+  std::unique_ptr<consensus::ConsensusServiceProxy> consensus_proxy_;
+  std::unique_ptr<server::GenericServiceProxy> generic_proxy_;
 
   MetricRegistry ts_test_metric_registry_;
   scoped_refptr<MetricEntity> ts_test_metric_entity_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/tserver/tablet_server-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index 854bd1c..eb037e0 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -17,12 +17,13 @@
 
 #include "kudu/tserver/tablet_server-test-base.h"
 
+#include <stdlib.h>
 #include <unistd.h>
 
 #include <cstdint>
 #include <memory>
+#include <set>
 #include <sstream>
-#include <stddef.h>
 #include <string>
 #include <utility>
 #include <vector>
@@ -75,6 +76,7 @@
 #include "kudu/tserver/mini_tablet_server.h"
 #include "kudu/tserver/scanners.h"
 #include "kudu/tserver/tablet_server.h"
+#include "kudu/tserver/tablet_server_options.h"
 #include "kudu/tserver/tablet_server_test_util.h"
 #include "kudu/tserver/ts_tablet_manager.h"
 #include "kudu/tserver/tserver.pb.h"
@@ -114,6 +116,7 @@ using kudu::tablet::RowSetDataPB;
 using kudu::tablet::Tablet;
 using kudu::tablet::TabletReplica;
 using kudu::tablet::TabletSuperBlockPB;
+using std::set;
 using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
@@ -131,12 +134,19 @@ DEFINE_int32(single_threaded_insert_latency_bench_insert_rows, 1000,
 DEFINE_int32(delete_tablet_bench_num_flushes, 200,
              "Number of disk row sets to flush in the delete tablet benchmark");
 
+DECLARE_bool(crash_on_eio);
+DECLARE_bool(enable_maintenance_manager);
 DECLARE_bool(fail_dns_resolution);
+DECLARE_double(env_inject_eio);
+DECLARE_int32(flush_threshold_mb);
+DECLARE_int32(flush_threshold_secs);
+DECLARE_int32(maintenance_manager_num_threads);
 DECLARE_int32(metrics_retirement_age_ms);
 DECLARE_int32(scanner_batch_size_rows);
 DECLARE_int32(scanner_gc_check_interval_us);
 DECLARE_int32(scanner_ttl_ms);
 DECLARE_string(block_manager);
+DECLARE_string(env_inject_eio_globs);
 
 // Declare these metrics prototypes for simpler unit testing of their behavior.
 METRIC_DECLARE_counter(rows_inserted);
@@ -348,6 +358,111 @@ TEST_F(TabletServerTest, TestWebPages) {
 #endif
 }
 
+class TabletServerDiskFailureTest : public TabletServerTestBase {
+ public:
+  virtual void SetUp() override {
+    const int kNumDirs = 5;  // Number of data dirs; also used as the maintenance thread count.
+    NO_FATALS(TabletServerTestBase::SetUp());
+    // Turn on the maintenance manager with small flush thresholds so the server flushes often.
+    FLAGS_enable_maintenance_manager = true;
+    FLAGS_maintenance_manager_num_threads = kNumDirs;
+    FLAGS_flush_threshold_mb = 1;
+    FLAGS_flush_threshold_secs = 1;
+
+    // Create a brand new tablet server with multiple disks, ensuring it can
+    // survive at least one disk failure.
+    NO_FATALS(StartTabletServer(/*num_data_dirs=*/ kNumDirs));
+  }
+};
+
+// Applies randomized write operations to a tablet, then turns on EIO injection
+// for one data directory and checks that the replica reaches the FAILED state.
+TEST_F(TabletServerDiskFailureTest, TestRandomOpSequence) {
+  if (!AllowSlowTests()) {
+    LOG(INFO) << "Not running slow test. To run, use KUDU_ALLOW_SLOW_TESTS=1";
+    return;
+  }
+  typedef vector<RowOperationsPB::Type> OpTypeList;
+  const OpTypeList kOpsIfKeyNotPresent = { RowOperationsPB::INSERT, RowOperationsPB::UPSERT
};
+  const OpTypeList kOpsIfKeyPresent = { RowOperationsPB::UPSERT, RowOperationsPB::UPDATE,
+                                        RowOperationsPB::DELETE };
+  const int kMaxKey = 100000;
+
+  // Configure these up front so that, later on, changing the single value
+  // FLAGS_env_inject_eio is enough to actually start injecting errors.
+  FLAGS_crash_on_eio = false;
+  FLAGS_env_inject_eio_globs =
+    JoinPathSegments(mini_server_->options()->fs_opts.data_roots[1], "**");
+
+  set<int> keys;
+  const auto GetRandomString = [] {
+    return StringPrintf("%d", rand() % kMaxKey);
+  };
+
+  // Perform a random op (insert, update, upsert, or delete) and return its Status.
+  const auto PerformOp = [&] {
+    // Set up the request.
+    WriteRequestPB req;
+    req.set_tablet_id(kTabletId);
+    RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema()));
+
+    // Set up the other state.
+    WriteResponsePB resp;
+    RpcController controller;
+    RowOperationsPB::Type op_type;
+    int key = rand() % kMaxKey;
+    auto key_iter = keys.find(key);
+    if (key_iter == keys.end()) {
+      // If the key does not exist yet, insert or upsert.
+      op_type = kOpsIfKeyNotPresent[rand() % kOpsIfKeyNotPresent.size()];
+    } else {
+      // ... else we can do anything but insert.
+      op_type = kOpsIfKeyPresent[rand() % kOpsIfKeyPresent.size()];
+    }
+
+    // Add the op to the request and record its effect in 'keys'.
+    if (op_type != RowOperationsPB::DELETE) {
+      AddTestRowToPB(op_type, schema_, key, key, GetRandomString(),
+                     req.mutable_row_operations());
+      keys.insert(key);
+    } else {
+      AddTestKeyToPB(RowOperationsPB::DELETE, schema_, key, req.mutable_row_operations());
+      keys.erase(key_iter);
+    }
+
+    // Finally, write to the server and log the response.
+    RETURN_NOT_OK_PREPEND(proxy_->Write(req, &resp, &controller), "Failed to write");
+    LOG(INFO) << "Tablet server responded with: " << SecureDebugString(resp);
+    return resp.has_error() ?  StatusFromPB(resp.error().status()) : Status::OK();
+  };
+
+  // Perform some arbitrarily large number of ops, with some pauses to encourage flushes.
+  for (int i = 0; i < 500; i++) {
+    if (i % 10) {  // NOTE(review): nonzero for 9 of every 10 iterations -- confirm intended.
+      SleepFor(MonoDelta::FromMilliseconds(100));
+    }
+    ASSERT_OK(PerformOp());
+  }
+  // At this point, a bunch of operations have gone through successfully. Start
+  // injecting EIOs (at a rate of 0.01) into the data directory selected above.
+  FLAGS_env_inject_eio = 0.01;
+
+  // The tablet will eventually be failed and will not be able to accept
+  // updates. Keep on applying ops until one of them fails.
+  ASSERT_EVENTUALLY([&] {
+    Status s;
+    for (int i = 0; i < 150 && s.ok(); i++) {
+      s = PerformOp();
+    }
+    ASSERT_FALSE(s.ok());
+  });
+  LOG(INFO) << "Failure was caught by an op!";
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_EQ(tablet::FAILED, tablet_replica_->state());
+  });
+  LOG(INFO) << "Tablet was successfully failed";
+}
+
 TEST_F(TabletServerTest, TestInsert) {
   WriteRequestPB req;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/tserver/tablet_server_test_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server_test_util.cc b/src/kudu/tserver/tablet_server_test_util.cc
index 7ac3bec..61dc3a5 100644
--- a/src/kudu/tserver/tablet_server_test_util.cc
+++ b/src/kudu/tserver/tablet_server_test_util.cc
@@ -33,11 +33,11 @@ using std::shared_ptr;
 
 void CreateTsClientProxies(const Sockaddr& addr,
                            const shared_ptr<Messenger>& messenger,
-                           gscoped_ptr<TabletCopyServiceProxy>* tablet_copy_proxy,
-                           gscoped_ptr<TabletServerServiceProxy>* tablet_server_proxy,
-                           gscoped_ptr<TabletServerAdminServiceProxy>* admin_proxy,
-                           gscoped_ptr<ConsensusServiceProxy>* consensus_proxy,
-                           gscoped_ptr<server::GenericServiceProxy>* generic_proxy) {
+                           std::unique_ptr<TabletCopyServiceProxy>* tablet_copy_proxy,
+                           std::unique_ptr<TabletServerServiceProxy>* tablet_server_proxy,
+                           std::unique_ptr<TabletServerAdminServiceProxy>* admin_proxy,
+                           std::unique_ptr<ConsensusServiceProxy>* consensus_proxy,
+                           std::unique_ptr<server::GenericServiceProxy>* generic_proxy) {
   const auto& host = addr.host();
   tablet_copy_proxy->reset(new TabletCopyServiceProxy(messenger, addr, host));
   tablet_server_proxy->reset(new TabletServerServiceProxy(messenger, addr, host));

http://git-wip-us.apache.org/repos/asf/kudu/blob/a18710cb/src/kudu/tserver/tablet_server_test_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server_test_util.h b/src/kudu/tserver/tablet_server_test_util.h
index 104e5ce..0281d12 100644
--- a/src/kudu/tserver/tablet_server_test_util.h
+++ b/src/kudu/tserver/tablet_server_test_util.h
@@ -14,13 +14,10 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#ifndef KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_
-#define KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_
+#pragma once
 
 #include <memory>
 
-#include "kudu/gutil/gscoped_ptr.h"
-
 namespace kudu {
 class Sockaddr;
 
@@ -44,13 +41,11 @@ class TabletServerServiceProxy;
 // Create tablet server client proxies for tests.
 void CreateTsClientProxies(const Sockaddr& addr,
                            const std::shared_ptr<rpc::Messenger>& messenger,
-                           gscoped_ptr<TabletCopyServiceProxy>* tablet_copy_proxy,
-                           gscoped_ptr<TabletServerServiceProxy>* tablet_server_proxy,
-                           gscoped_ptr<TabletServerAdminServiceProxy>* admin_proxy,
-                           gscoped_ptr<consensus::ConsensusServiceProxy>* consensus_proxy,
-                           gscoped_ptr<server::GenericServiceProxy>* generic_proxy);
+                           std::unique_ptr<TabletCopyServiceProxy>* tablet_copy_proxy,
+                           std::unique_ptr<TabletServerServiceProxy>* tablet_server_proxy,
+                           std::unique_ptr<TabletServerAdminServiceProxy>* admin_proxy,
+                           std::unique_ptr<consensus::ConsensusServiceProxy>* consensus_proxy,
+                           std::unique_ptr<server::GenericServiceProxy>* generic_proxy);
 
 } // namespace tserver
 } // namespace kudu
-
-#endif // KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_


Mime
View raw message