kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ale...@apache.org
Subject [2/2] kudu git commit: catalog_manager_tsk-itest: ensure that test eventually makes progress
Date Fri, 17 Nov 2017 23:44:20 GMT
catalog_manager_tsk-itest: ensure that test eventually makes progress

This test previously tried to introduce a lot of master leader elections
by setting a very low heartbeat and failure interval. This worked, but
sometimes worked so well that the test never made progress and couldn't
obtain a stable leader long enough to create a table.

This patch changes the test to instead use a separate thread which
manually triggers an election on each master in turn. The elections
start off very frequent and then back off as the test progresses, so
that by the end a leader stays stable long enough for the test to
actually make progress.

I verified that this still covers the case of a failed write when
writing TSKs by changing the RETURN_NOT_OK to a CHECK_OK when storing
the TSK. With the CHECK_OK, the test failed nearly immediately.

Change-Id: I3ecda0c269225e7674bc384fee652576b110ae7b
Reviewed-on: http://gerrit.cloudera.org:8080/8567
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <aserbin@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/8e6bfa9f
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/8e6bfa9f
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/8e6bfa9f

Branch: refs/heads/master
Commit: 8e6bfa9fb8aa7292e75a9fd3dcf9c50c8ea5191e
Parents: c4006ae
Author: Todd Lipcon <todd@apache.org>
Authored: Wed Nov 15 21:49:17 2017 -0800
Committer: Alexey Serbin <aserbin@cloudera.com>
Committed: Fri Nov 17 23:43:13 2017 +0000

----------------------------------------------------------------------
 .../catalog_manager_tsk-itest.cc                | 66 ++++++++++++++------
 1 file changed, 46 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/8e6bfa9f/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/catalog_manager_tsk-itest.cc b/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
index 0a0a0e4..9812102 100644
--- a/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
+++ b/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
@@ -16,12 +16,17 @@
 // under the License.
 
 #include <algorithm>
+#include <atomic>
+#include <cstdlib>
 #include <cstdint>
 #include <iterator>
 #include <memory>
+#include <ostream>
 #include <string>
+#include <thread>
 #include <vector>
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/client/client-test-util.h"
@@ -30,11 +35,18 @@
 #include "kudu/client/shared_ptr.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/partial_row.h"
+#include "kudu/consensus/consensus.pb.h"
+#include "kudu/consensus/consensus.proxy.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/master/sys_catalog.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
 #include "kudu/tablet/key_value_test_schema.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/net/sockaddr.h"
+#include "kudu/util/scoped_cleanup.h"
+#include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
@@ -47,6 +59,7 @@ using kudu::client::KuduTable;
 using kudu::client::KuduTableCreator;
 using kudu::cluster::ExternalMiniCluster;
 using kudu::cluster::ExternalMiniClusterOptions;
+using std::atomic;
 using std::back_inserter;
 using std::copy;
 using std::string;
@@ -74,22 +87,10 @@ class CatalogManagerTskITest : public KuduTest {
     cluster_opts_.master_rpc_ports = { 11030, 11031, 11032 };
     cluster_opts_.num_tablet_servers = num_tservers_;
 
-    // Add common flags for both masters and tservers.
-    const vector<string> common_flags = {
-      Substitute("--raft_heartbeat_interval_ms=$0", hb_interval_ms_),
-    };
-    copy(common_flags.begin(), common_flags.end(),
-        back_inserter(cluster_opts_.extra_master_flags));
-    copy(common_flags.begin(), common_flags.end(),
-        back_inserter(cluster_opts_.extra_tserver_flags));
-
     // Add master-only flags.
     const vector<string> master_flags = {
       "--catalog_manager_inject_latency_prior_tsk_write_ms=1000",
       "--raft_enable_pre_election=false",
-      Substitute("--leader_failure_exp_backoff_max_delta_ms=$0",
-          hb_interval_ms_ * 4),
-      "--leader_failure_max_missed_heartbeat_periods=1.0",
       "--master_non_leader_masters_propagate_tsk",
       "--tsk_rotation_seconds=2",
     };
@@ -113,7 +114,7 @@ class CatalogManagerTskITest : public KuduTest {
     using ::kudu::client::sp::shared_ptr;
     static const char* kTableName = "test-table";
     // Using the setting for both RPC and admin operation timeout.
-    const MonoDelta timeout = MonoDelta::FromSeconds(600);
+    const MonoDelta timeout = MonoDelta::FromSeconds(120);
     KuduClientBuilder builder;
     builder.default_admin_operation_timeout(timeout).default_rpc_timeout(timeout);
     shared_ptr<KuduClient> client;
@@ -157,21 +158,46 @@ class CatalogManagerTskITest : public KuduTest {
 
 // Check that master servers do not crash on change of leadership while
 // writing newly generated TSKs. The leadership changes are provoked
-// by the injected latency just after generating a TSK but prior to writing it
-// into the system table: setting --leader_failure_max_missed_heartbeat_periods
-// flag to just one heartbeat period and unsetting --raft_enable_pre_election
-// gives high chances of re-election to happen while current leader has blocked
-// its leadership-related activity.
+// by a separate thread which just forces each leader to call elections
+// in turn, separated by random sleeps.
 TEST_F(CatalogManagerTskITest, LeadershipChangeOnTskGeneration) {
   NO_FATALS(StartCluster());
 
+  std::atomic<bool> done { false };
+  std::thread t([&]() {
+      // At the start of the test, cause leader elections rapidly,
+      // but then space them out further and further as the test goes
+      // to ensure that we eventually do get a successful run.
+      double max_sleep_ms = 5;
+      while (!done) {
+        for (int i = 0; i < cluster_->num_masters() && !done; i++) {
+          LOG(INFO) << "Attempting to promote master " << i << " to leader";
+          consensus::ConsensusServiceProxy proxy(
+              cluster_->messenger(), cluster_->master(i)->bound_rpc_addr(), "master");
+          consensus::RunLeaderElectionRequestPB req;
+          consensus::RunLeaderElectionResponsePB resp;
+          rpc::RpcController rpc;
+          req.set_tablet_id(master::SysCatalogTable::kSysCatalogTabletId);
+          req.set_dest_uuid(cluster_->master(i)->uuid());
+          rpc.set_timeout(MonoDelta::FromSeconds(10));
+          WARN_NOT_OK(proxy.RunLeaderElection(req, &resp, &rpc),
+                      "couldn't promote new leader");
+          int s = rand() % static_cast<int>(max_sleep_ms);
+          LOG(INFO) << "Sleeping for " << s;
+          SleepFor(MonoDelta::FromMilliseconds(s));
+          max_sleep_ms = std::min(max_sleep_ms * 1.1, 3000.0);
+        }
+      }
+    });
+  SCOPED_CLEANUP({ done = true; t.join(); });
+
   const MonoTime t_stop = MonoTime::Now() +
       MonoDelta::FromSeconds(run_time_seconds_);
   while (MonoTime::Now() < t_stop) {
     NO_FATALS(SmokeTestCluster());
+    NO_FATALS(cluster_->AssertNoCrashes());
   }
-
-  NO_FATALS(cluster_->AssertNoCrashes());
+  LOG(INFO) << "Done. Waiting on elector thread.";
 }
 
 } // namespace master


Mime
View raw message