Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 25265200D3B for ; Thu, 26 Oct 2017 18:12:41 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 21D051609E8; Thu, 26 Oct 2017 16:12:41 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 7071A160BF2 for ; Thu, 26 Oct 2017 18:12:40 +0200 (CEST) Received: (qmail 15802 invoked by uid 500); 26 Oct 2017 16:12:39 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 15774 invoked by uid 99); 26 Oct 2017 16:12:39 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 26 Oct 2017 16:12:39 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 93963DFC35; Thu, 26 Oct 2017 16:12:35 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: adar@apache.org To: commits@kudu.apache.org Date: Thu, 26 Oct 2017 16:12:36 -0000 Message-Id: In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/4] kudu git commit: data_dirs: be permissive if RefreshIsFull fails archived-at: Thu, 26 Oct 2017 16:12:41 -0000 data_dirs: be permissive if RefreshIsFull fails In testing disk failures, at the DataDirManager-level, RefreshIsFull() is a point of failure that can fail various functions, e.g. for block placement. This doesn't need to be the case; it should be sufficient to instead just not include the failed directory candidate. This is favorable to returning early because RefreshIsFull() may cause these functions to return before any error-handling has happened, and any disk-failure handling will be triggered regardless the next time a block is written to the bad disk. Change-Id: Iffd0342e52e9f6a76ba17c179a36a8a971b94786 Reviewed-on: http://gerrit.cloudera.org:8080/8389 Reviewed-by: Adar Dembo Tested-by: Adar Dembo Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e03b8e44 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e03b8e44 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e03b8e44 Branch: refs/heads/master Commit: e03b8e4437920239ca5542143d36c7bd5a496eb2 Parents: 69b82b5 Author: Andrew Wong Authored: Wed Oct 25 14:04:29 2017 -0700 Committer: Adar Dembo Committed: Thu Oct 26 16:11:42 2017 +0000 ---------------------------------------------------------------------- src/kudu/fs/data_dirs.cc | 27 ++++++++++++++------------- src/kudu/fs/data_dirs.h | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/e03b8e44/src/kudu/fs/data_dirs.cc ---------------------------------------------------------------------- diff --git a/src/kudu/fs/data_dirs.cc b/src/kudu/fs/data_dirs.cc index 5e9b0d0..1ae0e9f 100644 --- a/src/kudu/fs/data_dirs.cc +++ b/src/kudu/fs/data_dirs.cc @@ -670,7 +670,7 @@ Status DataDirManager::CreateDataDirGroup(const string& tablet_id, return Status::IOError("No healthy data directories available", "", ENODEV); } } - RETURN_NOT_OK(GetDirsForGroupUnlocked(group_target_size, &group_indices)); + GetDirsForGroupUnlocked(group_target_size, &group_indices); if (PREDICT_FALSE(group_indices.empty())) { return Status::IOError("All healthy data directories are full", "", ENOSPC); } @@ -724,8 +724,9 @@ Status DataDirManager::GetNextDataDir(const CreateBlockOptions& opts, DataDir** for (int i : random_indices) { int uuid_idx = (*group_uuid_indices)[i]; DataDir* candidate = FindOrDie(data_dir_by_uuid_idx_, uuid_idx); - RETURN_NOT_OK(candidate->RefreshIsFull(DataDir::RefreshMode::EXPIRED_ONLY)); - if (!candidate->is_full()) { + Status s = candidate->RefreshIsFull(DataDir::RefreshMode::EXPIRED_ONLY); + WARN_NOT_OK(s, Substitute("failed to refresh fullness of $0", candidate->dir())); + if (s.ok() && !candidate->is_full()) { *dir = candidate; return Status::OK(); } @@ -740,8 +741,8 @@ Status DataDirManager::GetNextDataDir(const CreateBlockOptions& opts, DataDir** metrics_->data_dirs_full->value(), dirs_state_str); } return Status::IOError(Substitute("No directories available to add to $0directory group ($1 " - "dirs total, $2).", tablet_id_str, data_dirs_.size(), dirs_state_str), - "", ENOSPC); + "dirs total, $2).", tablet_id_str, data_dirs_.size(), dirs_state_str), + "", ENOSPC); } void DataDirManager::DeleteDataDirGroup(const std::string& tablet_id) { @@ -768,19 +769,20 @@ bool DataDirManager::GetDataDirGroupPB(const std::string& tablet_id, return false; } -Status DataDirManager::GetDirsForGroupUnlocked(int target_size, - vector* group_indices) { +void DataDirManager::GetDirsForGroupUnlocked(int target_size, + vector* group_indices) { DCHECK(dir_group_lock_.is_locked()); vector candidate_indices; for (auto& e : data_dir_by_uuid_idx_) { if (ContainsKey(failed_data_dirs_, e.first)) { continue; } - RETURN_NOT_OK(e.second->RefreshIsFull(DataDir::RefreshMode::ALWAYS)); - // TODO(awong): If a disk is unhealthy at the time of group creation, the - // resulting group may be below targeted size. Add functionality to resize - // groups. See KUDU-2040 for more details. - if (!e.second->is_full()) { + Status s = e.second->RefreshIsFull(DataDir::RefreshMode::ALWAYS); + WARN_NOT_OK(s, Substitute("failed to refresh fullness of $0", e.second->dir())); + if (s.ok() && !e.second->is_full()) { + // TODO(awong): If a disk is unhealthy at the time of group creation, the + // resulting group may be below targeted size. Add functionality to + // resize groups. See KUDU-2040 for more details. candidate_indices.push_back(e.first); } } @@ -796,7 +798,6 @@ Status DataDirManager::GetDirsForGroupUnlocked(int target_size, candidate_indices.erase(candidate_indices.begin() + 1); } } - return Status::OK(); } DataDir* DataDirManager::FindDataDirByUuidIndex(int uuid_idx) const { http://git-wip-us.apache.org/repos/asf/kudu/blob/e03b8e44/src/kudu/fs/data_dirs.h ---------------------------------------------------------------------- diff --git a/src/kudu/fs/data_dirs.h b/src/kudu/fs/data_dirs.h index 1dca128..0ed4389 100644 --- a/src/kudu/fs/data_dirs.h +++ b/src/kudu/fs/data_dirs.h @@ -414,7 +414,7 @@ class DataDirManager { // added. Although this function does not itself change DataDirManager state, // its expected usage warrants that it is called within the scope of a // lock_guard of dir_group_lock_. - Status GetDirsForGroupUnlocked(int target_size, std::vector* group_indices); + void GetDirsForGroupUnlocked(int target_size, std::vector* group_indices); // Goes through the data dirs in 'uuid_indices' and populates // 'healthy_indices' with those that haven't failed.