Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 2BFF5200CA9 for ; Fri, 16 Jun 2017 23:44:01 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 2899C160BEC; Fri, 16 Jun 2017 21:44:01 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 4676F160BC0 for ; Fri, 16 Jun 2017 23:44:00 +0200 (CEST) Received: (qmail 98383 invoked by uid 500); 16 Jun 2017 21:43:59 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 98374 invoked by uid 99); 16 Jun 2017 21:43:59 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 16 Jun 2017 21:43:59 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 62839DFF16; Fri, 16 Jun 2017 21:43:59 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: alexey@apache.org To: commits@kudu.apache.org Date: Fri, 16 Jun 2017 21:43:59 -0000 Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: [1/3] kudu git commit: [registration-test] fix flake in TestTabletReports archived-at: Fri, 16 Jun 2017 21:44:01 -0000 Repository: kudu Updated Branches: refs/heads/master 834bb35e8 -> c0798a942 [registration-test] fix flake in TestTabletReports Prior to the fix, the failure rate attributed to this flake was about 0.3% in 1K run: http://dist-test.cloudera.org//job?job_id=aserbin.1497582474.17739 After the fix, no failures in 1K run: http://dist-test.cloudera.org//job?job_id=aserbin.1497629325.23427 Both runs were of DEBUG build run with -stress_cpu_threads=8 flag. The crux of the fix is to use AssertEvetually instead of relying on a hard-coded delay to capture system catalog's metric. That's because the metrics of interest are updated upon processing tablet reports from tservers which contain tablet consensus status information. The latter are sent with a tserver-->master heartbeat when the tserver determines that it has become the leader for one of its tablets. In addition to that, this patch includes a clean-up of the test code. Change-Id: I906465ad220236538175c80972ae055193f9bb45 Reviewed-on: http://gerrit.cloudera.org:8080/7209 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/722f8039 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/722f8039 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/722f8039 Branch: refs/heads/master Commit: 722f80399a748478c295d180459d1136c59edc14 Parents: 834bb35 Author: Alexey Serbin Authored: Fri Jun 16 09:18:07 2017 -0700 Committer: Alexey Serbin Committed: Fri Jun 16 21:23:23 2017 +0000 ---------------------------------------------------------------------- src/kudu/integration-tests/registration-test.cc | 108 ++++++++++--------- 1 file changed, 58 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/722f8039/src/kudu/integration-tests/registration-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/registration-test.cc b/src/kudu/integration-tests/registration-test.cc index a478616..4ae7bd3 100644 --- a/src/kudu/integration-tests/registration-test.cc +++ b/src/kudu/integration-tests/registration-test.cc @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. -#include -#include #include #include +#include +#include + +#include +#include #include "kudu/common/schema.h" #include "kudu/common/wire_protocol-test-util.h" @@ -51,11 +54,11 @@ DECLARE_int32(heartbeat_interval_ms); METRIC_DECLARE_counter(rows_inserted); METRIC_DECLARE_counter(rows_updated); -namespace kudu { - -using std::vector; using std::shared_ptr; using std::string; +using std::vector; + +namespace kudu { using master::CatalogManager; using master::CreateTableRequestPB; @@ -73,7 +76,7 @@ using tserver::MiniTabletServer; void CreateTableForTesting(MiniMaster* mini_master, const string& table_name, const Schema& schema, - string *tablet_id) { + string* tablet_id) { { CreateTableRequestPB req; CreateTableResponsePB resp; @@ -84,7 +87,7 @@ void CreateTableForTesting(MiniMaster* mini_master, CatalogManager* catalog = mini_master->master()->catalog_manager(); CatalogManager::ScopedLeaderSharedLock l(catalog); ASSERT_OK(l.first_failed_status()); - ASSERT_OK(catalog->CreateTable(&req, &resp, NULL)); + ASSERT_OK(catalog->CreateTable(&req, &resp, nullptr)); } int wait_time = 1000; @@ -135,20 +138,20 @@ void CreateTableForTesting(MiniMaster* mini_master, class RegistrationTest : public KuduTest { public: RegistrationTest() - : schema_({ ColumnSchema("c1", UINT32) }, 1) { + : schema_({ ColumnSchema("c1", UINT32) }, 1) { } - virtual void SetUp() OVERRIDE { + void SetUp() override { // Make heartbeats faster to speed test runtime. FLAGS_heartbeat_interval_ms = 10; KuduTest::SetUp(); - cluster_.reset(new MiniCluster(env_, MiniClusterOptions())); + cluster_.reset(new MiniCluster(env_, {})); ASSERT_OK(cluster_->Start()); } - virtual void TearDown() OVERRIDE { + void TearDown() override { cluster_->Shutdown(); } @@ -175,26 +178,32 @@ class RegistrationTest : public KuduTest { Status WaitForReplicaCount(const string& tablet_id, int expected_count, - TabletLocationsPB* locations) { + TabletLocationsPB* locations = nullptr) { while (true) { master::CatalogManager* catalog = cluster_->mini_master()->master()->catalog_manager(); Status s; + TabletLocationsPB loc; do { master::CatalogManager::ScopedLeaderSharedLock l(catalog); const Status& ls = l.first_failed_status(); - if (ls.IsServiceUnavailable()) { + if (ls.IsServiceUnavailable() || ls.IsIllegalState()) { // ServiceUnavailable means catalog manager is not yet ready - // to serve requests -- try again later. + // to serve requests; IllegalState means the catalog is not the + // leader yet. That's an indication of a transient state where it's + // it's necessary to try again later. break; // exiting out of the 'do {...} while (false)' scope } RETURN_NOT_OK(ls); - s = catalog->GetTabletLocations(tablet_id, locations); + s = catalog->GetTabletLocations(tablet_id, &loc); } while (false); - if (s.ok() && locations->replicas_size() == expected_count) { + if (s.ok() && loc.replicas_size() == expected_count) { + if (locations) { + *locations = std::move(loc); + } return Status::OK(); } - SleepFor(MonoDelta::FromMilliseconds(1)); + SleepFor(MonoDelta::FromMilliseconds(10)); } } @@ -215,14 +224,14 @@ TEST_F(RegistrationTest, TestTSRegisters) { descs[0]->GetRegistration(®); { SCOPED_TRACE(SecureShortDebugString(reg)); - ASSERT_EQ(SecureShortDebugString(reg).find("0.0.0.0"), string::npos) - << "Should not include wildcards in registration"; + ASSERT_EQ(string::npos, SecureShortDebugString(reg).find("0.0.0.0")) + << "Should not include wildcards in registration"; } ASSERT_NO_FATAL_FAILURE(CheckTabletServersPage()); // Restart the master, so it loses the descriptor, and ensure that the - // hearbeater thread handles re-registering. + // heartbeater thread handles re-registering. cluster_->mini_master()->Shutdown(); ASSERT_OK(cluster_->mini_master()->Restart()); @@ -255,62 +264,61 @@ TEST_F(RegistrationTest, TestMultipleTS) { // to something more appropriate - doesn't seem worth having separate // whole test suites for registration, tablet reports, etc. TEST_F(RegistrationTest, TestTabletReports) { - string tablet_id_1; - string tablet_id_2; - - MiniTabletServer* ts = cluster_->mini_tablet_server(0); - string ts_root = cluster_->GetTabletServerFsRoot(0); - auto GetCatalogMetric = [&](CounterPrototype& prototype) { - auto metrics = cluster_->mini_master()->master()->catalog_manager()->sys_catalog() - ->tablet_replica()->tablet()->GetMetricEntity(); + auto metrics = cluster_->mini_master()->master()->catalog_manager()-> + sys_catalog()->tablet_replica()->tablet()->GetMetricEntity(); return prototype.Instantiate(metrics)->value(); }; - int startup_rows_inserted = GetCatalogMetric(METRIC_rows_inserted); + const int startup_rows_inserted = GetCatalogMetric(METRIC_rows_inserted); // Add a table, make sure it reports itself. - CreateTableForTesting(cluster_->mini_master(), "fake-table", schema_, &tablet_id_1); - - TabletLocationsPB locs; - ASSERT_OK(WaitForReplicaCount(tablet_id_1, 1, &locs)); - ASSERT_EQ(1, locs.replicas_size()); - LOG(INFO) << "Tablet successfully reported on " << locs.replicas(0).ts_info().permanent_uuid(); + string tablet_id_1; + NO_FATALS(CreateTableForTesting( + cluster_->mini_master(), "tablet-reports-1", schema_, &tablet_id_1)); + TabletLocationsPB locs_1; + ASSERT_OK(WaitForReplicaCount(tablet_id_1, 1, &locs_1)); + ASSERT_EQ(1, locs_1.replicas_size()); // Check that we inserted the right number of rows for the new single-tablet table // (one for the table, one for the tablet). - int post_create_rows_inserted = GetCatalogMetric(METRIC_rows_inserted); + const int post_create_rows_inserted = GetCatalogMetric(METRIC_rows_inserted); EXPECT_EQ(2, post_create_rows_inserted - startup_rows_inserted) << "Should have inserted one row each for the table and tablet"; // Add another table, make sure it is reported via incremental. - CreateTableForTesting(cluster_->mini_master(), "fake-table2", schema_, &tablet_id_2); - ASSERT_OK(WaitForReplicaCount(tablet_id_2, 1, &locs)); + string tablet_id_2; + NO_FATALS(CreateTableForTesting( + cluster_->mini_master(), "tablet-reports-2", schema_, &tablet_id_2)); + ASSERT_OK(WaitForReplicaCount(tablet_id_2, 1)); // Shut down the whole system, bring it back up, and make sure the tablets // are reported. - ts->Shutdown(); - cluster_->mini_master()->Shutdown(); - ASSERT_OK(cluster_->mini_master()->Restart()); - ASSERT_OK(ts->Start()); - - ASSERT_OK(WaitForReplicaCount(tablet_id_1, 1, &locs)); - ASSERT_OK(WaitForReplicaCount(tablet_id_2, 1, &locs)); - - SleepFor(MonoDelta::FromSeconds(1)); + cluster_->Shutdown(); + ASSERT_OK(cluster_->Start()); // After restart, check that the tablet reports produced the expected number of // writes to the catalog table: // - No inserts, because there are no new tablets. - // - Two updates, since both replicas should have increased their term on restart. - EXPECT_EQ(0, GetCatalogMetric(METRIC_rows_inserted)); - EXPECT_EQ(2, GetCatalogMetric(METRIC_rows_updated)); + // - Two updates, since both replicas increase their term on restart. + // + // It can take some time for the TS to re-heartbeat. To avoid flakiness, here + // it's easier to wait for the target non-zero metric value instead of + // hard-coding an extra delay. + AssertEventually([&]() { + ASSERT_EQ(0, GetCatalogMetric(METRIC_rows_inserted)); + ASSERT_EQ(2, GetCatalogMetric(METRIC_rows_updated)); + }); // If we restart just the master, it should not write any data to the catalog, since the // tablets themselves are not changing term, etc. cluster_->mini_master()->Shutdown(); ASSERT_OK(cluster_->mini_master()->Restart()); + // Sleep for a second to make sure the TS has plenty of time to re-heartbeat. + // The metrics are updated after processing TS heartbeats, and we want to + // capture updated metric readings. SleepFor(MonoDelta::FromSeconds(1)); + EXPECT_EQ(0, GetCatalogMetric(METRIC_rows_inserted)); EXPECT_EQ(0, GetCatalogMetric(METRIC_rows_updated)); }