From commits-return-5327-archive-asf-public=cust-asf.ponee.io@kudu.apache.org Mon Feb 26 21:00:45 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 57CF418064A for ; Mon, 26 Feb 2018 21:00:45 +0100 (CET) Received: (qmail 86798 invoked by uid 500); 26 Feb 2018 20:00:44 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 86788 invoked by uid 99); 26 Feb 2018 20:00:44 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 26 Feb 2018 20:00:44 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id A968CEB4BA; Mon, 26 Feb 2018 20:00:43 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: adar@apache.org To: commits@kudu.apache.org Message-Id: <77fd3e006bc8405c90a2265220d2d3bd@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: kudu git commit: webserver-stress-itest: fix flakiness Date: Mon, 26 Feb 2018 20:00:43 +0000 (UTC) Repository: kudu Updated Branches: refs/heads/master f3382a9b8 -> c04981d81 webserver-stress-itest: fix flakiness This fixes a source of flakiness I found on the flaky dashboard. In some runs of this test, we'd hit the following interleaving: - we start the master with webserver_port=0 and it picks some port (eg 35000) - we stop the master - the curl threads are still running, and one of them picks port 35000 as the local side of its TCP connection. It then tries to connect to 35000 and hits the dreaded "tcp loop connect" phenomenon[1] in which it actually connects to _itself_. Thus it just hangs there occupying the port - we try to start the master again, and it fails to bind - we now time out trying to Join() on the curl thread, which is waiting forever for itself to respond to an HTTP request. The fix is to use non-ephemeral ports for the webserver as we already do for the RPC server. I additionally added timeouts to the curl calls. [1] http://www.rampa.sk/static/tcpLoopConnect.html Change-Id: If754d7f47a4c9c04bae3e9ef31acad801dd4db9b Reviewed-on: http://gerrit.cloudera.org:8080/9414 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c04981d8 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c04981d8 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c04981d8 Branch: refs/heads/master Commit: c04981d81d547819416155f2295e1210d9d7c39a Parents: f3382a9 Author: Todd Lipcon Authored: Thu Feb 22 19:28:53 2018 -0800 Committer: Adar Dembo Committed: Mon Feb 26 20:00:20 2018 +0000 ---------------------------------------------------------------------- src/kudu/integration-tests/linked_list-test-util.h | 6 +++++- src/kudu/integration-tests/webserver-stress-itest.cc | 15 +++++++++++++++ src/kudu/util/curl_util.cc | 6 +++++- src/kudu/util/curl_util.h | 7 +++++++ 4 files changed, 32 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/c04981d8/src/kudu/integration-tests/linked_list-test-util.h ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/linked_list-test-util.h b/src/kudu/integration-tests/linked_list-test-util.h index 3f873ed..ba04ea6 100644 --- a/src/kudu/integration-tests/linked_list-test-util.h +++ b/src/kudu/integration-tests/linked_list-test-util.h @@ -363,6 +363,8 @@ class PeriodicWebUIChecker { private: void CheckThread() { EasyCurl curl; + // Set some timeout so that if the page deadlocks, we fail the test. + curl.set_timeout(MonoDelta::FromSeconds(120)); faststring dst; LOG(INFO) << "Curl thread will poll the following URLs every " << period_.ToMilliseconds() << " ms: "; @@ -373,9 +375,11 @@ class PeriodicWebUIChecker { // Poll all of the URLs. const MonoTime start = MonoTime::Now(); for (const auto& url : urls_) { - if (curl.FetchURL(url, &dst).ok()) { + Status s = curl.FetchURL(url, &dst); + if (s.ok()) { CHECK_GT(dst.length(), 0); } + CHECK(!s.IsTimedOut()) << "timed out fetching url " << url; } // Sleep until the next period const MonoDelta elapsed = MonoTime::Now() - start; http://git-wip-us.apache.org/repos/asf/kudu/blob/c04981d8/src/kudu/integration-tests/webserver-stress-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/webserver-stress-itest.cc b/src/kudu/integration-tests/webserver-stress-itest.cc index 08ed172..51e4083 100644 --- a/src/kudu/integration-tests/webserver-stress-itest.cc +++ b/src/kudu/integration-tests/webserver-stress-itest.cc @@ -47,7 +47,22 @@ TEST_F(KuduTest, TestWebUIDoesNotCrashCluster) { const int kNumTablets = 50; ExternalMiniClusterOptions opts; + // Force specific ports so that we can restart and guarantee we + // can bind the same port. If we use ephemeral ports, it's possible + // for one of the 'curl' threads to grab one of the ports as the local + // side of a client TCP connection while the server is down, preventing + // it from restarting. Choosing ports from the non-ephemeral range + // prevents this. opts.master_rpc_ports = { 11010, 11011, 11012 }; +#ifdef __linux__ + // We can only do explicit webserver ports on Linux, where we use + // IPs like 127.x.y.z to bind the minicluster servers to different + // hosts. This might make the test marginally flaky on OSX, but + // it's easier than adding the ability to pipe separate webserver + // ports to each server. + opts.extra_master_flags.emplace_back("-webserver_port=11013"); + opts.extra_tserver_flags.emplace_back("-webserver_port=11014"); +#endif opts.num_masters = opts.master_rpc_ports.size(); ExternalMiniCluster cluster(opts); http://git-wip-us.apache.org/repos/asf/kudu/blob/c04981d8/src/kudu/util/curl_util.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/curl_util.cc b/src/kudu/util/curl_util.cc index 44ef089..f061dc5 100644 --- a/src/kudu/util/curl_util.cc +++ b/src/kudu/util/curl_util.cc @@ -94,7 +94,11 @@ Status EasyCurl::DoRequest(const std::string& url, } RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_HTTPAUTH, CURLAUTH_ANY))); - + if (timeout_.Initialized()) { + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1))); + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_TIMEOUT_MS, + timeout_.ToMilliseconds()))); + } RETURN_NOT_OK(TranslateError(curl_easy_perform(curl_))); long rc; // NOLINT(runtime/int) curl wants a long RETURN_NOT_OK(TranslateError(curl_easy_getinfo(curl_, CURLINFO_RESPONSE_CODE, &rc))); http://git-wip-us.apache.org/repos/asf/kudu/blob/c04981d8/src/kudu/util/curl_util.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/curl_util.h b/src/kudu/util/curl_util.h index 797c8a6..49ba2d4 100644 --- a/src/kudu/util/curl_util.h +++ b/src/kudu/util/curl_util.h @@ -20,6 +20,7 @@ #include #include "kudu/gutil/macros.h" +#include "kudu/util/monotime.h" #include "kudu/util/status.h" typedef void CURL; @@ -58,6 +59,10 @@ class EasyCurl { return_headers_ = v; } + void set_timeout(MonoDelta t) { + timeout_ = t; + } + private: // Do a request. If 'post_data' is non-NULL, does a POST. // Otherwise, does a GET. @@ -72,6 +77,8 @@ class EasyCurl { // Whether to return the HTTP headers with the response. bool return_headers_ = false; + MonoDelta timeout_; + DISALLOW_COPY_AND_ASSIGN(EasyCurl); };