kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a...@apache.org
Subject kudu git commit: fs: defer failure from metadata load to bootstrap when data dir is missing
Date Sun, 21 Jan 2018 20:05:51 GMT
Repository: kudu
Updated Branches:
  refs/heads/master 6626e109d -> b81d5569a


fs: defer failure from metadata load to bootstrap when data dir is missing

An upcoming patch adds a CLI tool action to remove data directories. When a
data dir is removed, all tablets with data on it will fail. Today that
failure manifests as a FindOrDie in DataDirGroup::FromPB; we need to make
that a little bit more graceful.

This patch modifies the DataDirGroup FromPB/CopyToPB methods to return a
failure when a data dir is missing. It further changes TabletMetadata to
treat such failures as non-fatal, and adds checks to TabletBootstrap so that
the failures manifest there instead.

No tests in this patch because:
1. Andrew has already merged tablet-level tests for failed disks, and
2. The CLI tool patch adds coverage at the itest-level.

Change-Id: I1e8d5697c2bb08287cce11fbdab6fb8d6e37d1ad
Reviewed-on: http://gerrit.cloudera.org:8080/8376
Reviewed-by: Todd Lipcon <todd@apache.org>
Reviewed-by: Andrew Wong <awong@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b81d5569
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b81d5569
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b81d5569

Branch: refs/heads/master
Commit: b81d5569a0fa9ee4d787b93703f210c2e2f8d93d
Parents: 6626e10
Author: Adar Dembo <adar@cloudera.com>
Authored: Tue Oct 24 18:18:39 2017 -0700
Committer: Adar Dembo <adar@cloudera.com>
Committed: Sun Jan 21 20:05:22 2018 +0000

----------------------------------------------------------------------
 src/kudu/fs/block_manager-stress-test.cc |  2 +-
 src/kudu/fs/block_manager-test.cc        | 10 +--
 src/kudu/fs/data_dirs-test.cc            | 11 ++--
 src/kudu/fs/data_dirs.cc                 | 88 +++++++++++++++++++++++----
 src/kudu/fs/data_dirs.h                  | 58 +++++++++---------
 src/kudu/fs/log_block_manager-test.cc    |  2 +-
 src/kudu/tablet/tablet_bootstrap.cc      | 13 +++-
 src/kudu/tablet/tablet_metadata.cc       | 11 ++--
 src/kudu/tserver/tablet_copy_client.cc   |  4 +-
 src/kudu/tserver/ts_tablet_manager.cc    |  8 ---
 10 files changed, 137 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/block_manager-stress-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager-stress-test.cc b/src/kudu/fs/block_manager-stress-test.cc
index 6ae4a2c..f5bedf3 100644
--- a/src/kudu/fs/block_manager-stress-test.cc
+++ b/src/kudu/fs/block_manager-stress-test.cc
@@ -150,7 +150,7 @@ class BlockManagerStressTest : public KuduTest {
     bm_.reset(CreateBlockManager());
     bm_->Open(nullptr);
     dd_manager_->CreateDataDirGroup(test_tablet_name_);
-    CHECK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
+    CHECK_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
   }
 
   virtual void TearDown() override {

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager-test.cc b/src/kudu/fs/block_manager-test.cc
index 46c8fec..e17a61e 100644
--- a/src/kudu/fs/block_manager-test.cc
+++ b/src/kudu/fs/block_manager-test.cc
@@ -119,9 +119,9 @@ class BlockManagerTest : public KuduTest {
   virtual void SetUp() override {
     // Pass in a report to prevent the block manager from logging unnecessarily.
     FsReport report;
-    CHECK_OK(bm_->Open(&report));
-    CHECK_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
-    CHECK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
+    ASSERT_OK(bm_->Open(&report));
+    ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
+    ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
   }
 
   void DistributeBlocksAcrossDirs(int num_dirs, int num_blocks_per_dir) {
@@ -231,7 +231,7 @@ void BlockManagerTest<LogBlockManager>::SetUp() {
   ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
 
   // Store the DataDirGroupPB for tests that reopen the block manager.
-  CHECK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
+  ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
 }
 
 template <>
@@ -486,7 +486,7 @@ TYPED_TEST(BlockManagerTest, CreateBlocksInDataDirs) {
 
   DataDirGroupPB test_group_pb;
   // Check that the in-memory DataDirGroup did not change.
-  ASSERT_TRUE(this->dd_manager_->GetDataDirGroupPB(
+  ASSERT_OK(this->dd_manager_->GetDataDirGroupPB(
       this->test_tablet_name_, &test_group_pb));
   ASSERT_TRUE(MessageDifferencer::Equals(test_group_pb, this->test_group_pb_));
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/data_dirs-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/data_dirs-test.cc b/src/kudu/fs/data_dirs-test.cc
index e5ea7e2..449780a 100644
--- a/src/kudu/fs/data_dirs-test.cc
+++ b/src/kudu/fs/data_dirs-test.cc
@@ -116,7 +116,7 @@ TEST_F(DataDirsTest, TestCreateGroup) {
 
   DataDirGroupPB orig_pb;
   ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
-  ASSERT_TRUE(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &orig_pb));
+  ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &orig_pb));
 
   // Ensure that the DataDirManager will not create a group for a tablet that
   // it already knows about.
@@ -125,7 +125,7 @@ TEST_F(DataDirsTest, TestCreateGroup) {
   ASSERT_STR_CONTAINS(s.ToString(), "Tried to create directory group for tablet "
                                     "but one is already registered");
   DataDirGroupPB pb;
-  ASSERT_TRUE(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &pb));
+  ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &pb));
 
   // Verify that the data directory is unchanged after failing to create an
   // existing tablet.
@@ -153,7 +153,7 @@ TEST_F(DataDirsTest, TestLoadFromPB) {
   // Create a PB, delete the group, then load the group from the PB.
   DataDirGroupPB orig_pb;
   ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
-  ASSERT_TRUE(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &orig_pb));
+  ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &orig_pb));
   dd_manager_->DeleteDataDirGroup(test_tablet_name_);
   ASSERT_OK(dd_manager_->LoadDataDirGroupFromPB(test_tablet_name_, orig_pb));
 
@@ -171,8 +171,7 @@ TEST_F(DataDirsTest, TestLoadFromPB) {
   // knows about the tablet.
   Status s = dd_manager_->LoadDataDirGroupFromPB(test_tablet_name_, orig_pb);
   ASSERT_TRUE(s.IsAlreadyPresent()) << s.ToString();
-  ASSERT_STR_CONTAINS(s.ToString(), "Tried to load directory group for tablet but "
-                                    "one is already registered");
+  ASSERT_STR_CONTAINS(s.ToString(), "tried to load directory group for tablet");
 }
 
 TEST_F(DataDirsTest, TestDeleteDataDirGroup) {
@@ -237,7 +236,7 @@ TEST_F(DataDirsTest, TestFailedDirNotAddedToGroup) {
         entity_->FindOrNull(METRIC_data_dirs_failed).get())->value());
   ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
   DataDirGroupPB pb;
-  ASSERT_TRUE(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &pb));
+  ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &pb));
   ASSERT_EQ(kNumDirs - 1, pb.uuids_size());
 
   // Check that all uuid_indices are valid and are not in the failed directory

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/data_dirs.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/data_dirs.cc b/src/kudu/fs/data_dirs.cc
index 32ee476..46cae9c 100644
--- a/src/kudu/fs/data_dirs.cc
+++ b/src/kudu/fs/data_dirs.cc
@@ -38,6 +38,7 @@
 
 #include "kudu/fs/block_manager.h"
 #include "kudu/fs/block_manager_util.h"
+#include "kudu/fs/fs.pb.h"
 #include "kudu/gutil/bind.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/integral_types.h"
@@ -187,6 +188,10 @@ void DeleteTmpFilesRecursively(Env* env, const string& path) {
 
 } // anonymous namespace
 
+////////////////////////////////////////////////////////////
+// DataDirMetrics
+////////////////////////////////////////////////////////////
+
 #define GINIT(x) x(METRIC_##x.Instantiate(entity, 0))
 DataDirMetrics::DataDirMetrics(const scoped_refptr<MetricEntity>& entity)
   : GINIT(data_dirs_failed),
@@ -194,6 +199,10 @@ DataDirMetrics::DataDirMetrics(const scoped_refptr<MetricEntity>& entity)
 }
 #undef GINIT
 
+////////////////////////////////////////////////////////////
+// DataDir
+////////////////////////////////////////////////////////////
+
 DataDir::DataDir(Env* env,
                  DataDirMetrics* metrics,
                  DataDirFsType fs_type,
@@ -280,10 +289,61 @@ Status DataDir::RefreshIsFull(RefreshMode mode) {
   return Status::OK();
 }
 
+////////////////////////////////////////////////////////////
+// DataDirGroup
+////////////////////////////////////////////////////////////
+
+DataDirGroup::DataDirGroup() {}
+
+DataDirGroup::DataDirGroup(vector<int> uuid_indices)
+    : uuid_indices_(std::move(uuid_indices)) {}
+
+Status DataDirGroup::LoadFromPB(const UuidIndexByUuidMap& uuid_idx_by_uuid,
+                                const DataDirGroupPB& pb) {
+  vector<int> uuid_indices;
+  for (const auto& uuid : pb.uuids()) {
+    int uuid_idx;
+    if (!FindCopy(uuid_idx_by_uuid, uuid, &uuid_idx)) {
+      return Status::NotFound(Substitute(
+          "could not find data dir with uuid $0", uuid));
+    }
+    uuid_indices.emplace_back(uuid_idx);
+  }
+
+  uuid_indices_ = std::move(uuid_indices);
+  return Status::OK();
+}
+
+Status DataDirGroup::CopyToPB(const UuidByUuidIndexMap& uuid_by_uuid_idx,
+                              DataDirGroupPB* pb) const {
+  DCHECK(pb);
+  DataDirGroupPB group;
+  for (auto uuid_idx : uuid_indices_) {
+    string uuid;
+    if (!FindCopy(uuid_by_uuid_idx, uuid_idx, &uuid)) {
+      return Status::NotFound(Substitute(
+          "could not find data dir with uuid index $0", uuid_idx));
+    }
+    group.mutable_uuids()->Add(std::move(uuid));
+  }
+
+  *pb = std::move(group);
+  return Status::OK();
+}
+
+////////////////////////////////////////////////////////////
+// DataDirManagerOptions
+////////////////////////////////////////////////////////////
+
 DataDirManagerOptions::DataDirManagerOptions()
-  : block_manager_type(FLAGS_block_manager),
-    read_only(false),
-    update_on_disk(false) {}
+    : block_manager_type(FLAGS_block_manager),
+      read_only(false),
+      update_on_disk(false) {
+}
+
+////////////////////////////////////////////////////////////
+// DataDirManager
+////////////////////////////////////////////////////////////
 
 vector<string> DataDirManager::GetRootNames(const CanonicalizedRootsList& root_list) {
   vector<string> roots;
@@ -782,13 +842,16 @@ Status DataDirManager::Open() {
 Status DataDirManager::LoadDataDirGroupFromPB(const std::string& tablet_id,
                                               const DataDirGroupPB& pb) {
   std::lock_guard<percpu_rwlock> lock(dir_group_lock_);
-  DataDirGroup group_from_pb = DataDirGroup::FromPB(pb, idx_by_uuid_);
+  DataDirGroup group_from_pb;
+  RETURN_NOT_OK_PREPEND(group_from_pb.LoadFromPB(idx_by_uuid_, pb), Substitute(
+      "could not load data dir group for tablet $0", tablet_id));
   DataDirGroup* other = InsertOrReturnExisting(&group_by_tablet_map_,
                                                tablet_id,
                                                group_from_pb);
   if (other != nullptr) {
-    return Status::AlreadyPresent("Tried to load directory group for tablet but one is already "
-                                  "registered", tablet_id);
+    return Status::AlreadyPresent(Substitute(
+        "tried to load directory group for tablet $0 but one is already registered",
+        tablet_id));
   }
   for (int uuid_idx : group_from_pb.uuid_indices()) {
     InsertOrDie(&FindOrDie(tablets_by_uuid_idx_map_, uuid_idx), tablet_id);
@@ -919,15 +982,16 @@ void DataDirManager::DeleteDataDirGroup(const std::string& tablet_id) {
   group_by_tablet_map_.erase(tablet_id);
 }
 
-bool DataDirManager::GetDataDirGroupPB(const std::string& tablet_id,
-                                       DataDirGroupPB* pb) const {
+Status DataDirManager::GetDataDirGroupPB(const string& tablet_id,
+                                         DataDirGroupPB* pb) const {
   shared_lock<rw_spinlock> lock(dir_group_lock_.get_lock());
   const DataDirGroup* group = FindOrNull(group_by_tablet_map_, tablet_id);
-  if (group != nullptr) {
-    group->CopyToPB(uuid_by_idx_, pb);
-    return true;
+  if (group == nullptr) {
+    return Status::NotFound(Substitute(
+        "could not find data dir group for tablet $0", tablet_id));
   }
-  return false;
+  RETURN_NOT_OK(group->CopyToPB(uuid_by_idx_, pb));
+  return Status::OK();
 }
 
 void DataDirManager::GetDirsForGroupUnlocked(int target_size,

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/data_dirs.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/data_dirs.h b/src/kudu/fs/data_dirs.h
index d886f91..8dcf371 100644
--- a/src/kudu/fs/data_dirs.h
+++ b/src/kudu/fs/data_dirs.h
@@ -26,14 +26,11 @@
 #include <utility>
 #include <vector>
 
-#include <glog/logging.h>
 #include <gtest/gtest_prod.h>
 
-#include "kudu/fs/fs.pb.h"
 #include "kudu/gutil/callback.h"
-#include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/macros.h"
-#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/ref_counted.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
@@ -41,6 +38,8 @@
 #include "kudu/util/status.h"
 
 namespace kudu {
+
+class DataDirGroupPB;
 class Env;
 class ThreadPool;
 
@@ -77,33 +76,29 @@ namespace internal {
 // The same directory may appear in multiple DataDirGroups.
 class DataDirGroup {
  public:
-  explicit DataDirGroup(std::vector<int> uuid_indices)
-      : uuid_indices_(std::move(uuid_indices)) {}
-
-  static DataDirGroup FromPB(const DataDirGroupPB& pb,
-                             const UuidIndexByUuidMap& uuid_idx_by_uuid) {
-    std::vector<int> uuid_indices;
-    for (const std::string& uuid : pb.uuids()) {
-      uuid_indices.push_back(FindOrDie(uuid_idx_by_uuid, uuid));
-    }
-    return DataDirGroup(std::move(uuid_indices));
-  }
+  DataDirGroup();
 
-  void CopyToPB(const UuidByUuidIndexMap& uuid_by_uuid_idx,
-                DataDirGroupPB* pb) const {
-    DCHECK(pb);
-    DataDirGroupPB group;
-    for (int uuid_idx : uuid_indices_) {
-      group.add_uuids(FindOrDie(uuid_by_uuid_idx, uuid_idx));
-    }
-    pb->Swap(&group);
-  }
+  explicit DataDirGroup(std::vector<int> uuid_indices);
+
+  // Reloads the DataDirGroup with UUID indices for the UUIDs in 'pb' by
+  // looking them up in 'uuid_idx_by_uuid'.
+  //
+  // Returns an error if a uuid cannot be found.
+  Status LoadFromPB(const UuidIndexByUuidMap& uuid_idx_by_uuid,
+                    const DataDirGroupPB& pb);
+
+  // Writes this group's UUIDs to 'pb', looking them up via index in
+  // 'uuid_by_uuid_idx'.
+  //
+  // Returns an error if an index cannot be found.
+  Status CopyToPB(const UuidByUuidIndexMap& uuid_by_uuid_idx,
+                  DataDirGroupPB* pb) const;
 
   const std::vector<int>& uuid_indices() const { return uuid_indices_; }
 
  private:
   // UUID indices corresponding to the data directories within the group.
-  const std::vector<int> uuid_indices_;
+  std::vector<int> uuid_indices_;
 };
 
 }  // namespace internal
@@ -290,10 +285,17 @@ class DataDirManager {
   // Deserializes a DataDirGroupPB and associates the resulting DataDirGroup
   // with a tablet_id.
   //
-  // Results in an error if the tablet already exists.
+  // Returns an error if the tablet already exists or if a data dir in the
+  // group is missing.
   Status LoadDataDirGroupFromPB(const std::string& tablet_id,
                                 const DataDirGroupPB& pb);
 
+  // Serializes the DataDirGroupPB associated with the given tablet_id.
+  //
+  // Returns an error if the tablet was not already registered or if a data dir
+  // is missing.
+  Status GetDataDirGroupPB(const std::string& tablet_id, DataDirGroupPB* pb) const;
+
   // Creates a new data dir group for the specified tablet. Adds data
   // directories to this new group until the limit specified by
   // fs_target_data_dirs_per_tablet, or until there is no more space.
@@ -313,10 +315,6 @@ class DataDirManager {
   // and data dir to tablet set are cleared of all references to the tablet.
   void DeleteDataDirGroup(const std::string& tablet_id);
 
-  // Serializes the DataDirGroupPB associated with the given tablet_id. Returns
-  // false if none exist.
-  bool GetDataDirGroupPB(const std::string& tablet_id, DataDirGroupPB* pb) const;
-
   // Returns a random directory from the specfied option's data dir group. If
   // there is no room in the group, returns an error.
   Status GetNextDataDir(const CreateBlockOptions& opts, DataDir** dir);

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index eca2ccc..460d9d4 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -116,7 +116,7 @@ class LogBlockManagerTest : public KuduTest {
     FsReport report;
     ASSERT_OK(bm_->Open(&report));
     ASSERT_OK(dd_manager_->CreateDataDirGroup(test_tablet_name_));
-    ASSERT_TRUE(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
+    ASSERT_OK(dd_manager_->GetDataDirGroupPB(test_tablet_name_, &test_group_pb_));
   }
 
  protected:

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/tablet/tablet_bootstrap.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_bootstrap.cc b/src/kudu/tablet/tablet_bootstrap.cc
index 371a6f0..759b1b3 100644
--- a/src/kudu/tablet/tablet_bootstrap.cc
+++ b/src/kudu/tablet/tablet_bootstrap.cc
@@ -49,6 +49,8 @@
 #include "kudu/consensus/opid.pb.h"
 #include "kudu/consensus/opid_util.h"
 #include "kudu/consensus/raft_consensus.h"
+#include "kudu/fs/data_dirs.h"
+#include "kudu/fs/fs.pb.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/bind.h"
 #include "kudu/gutil/gscoped_ptr.h"
@@ -88,7 +90,6 @@
 #include "kudu/util/pb_util.h"
 #include "kudu/util/stopwatch.h"
 
-
 DECLARE_int32(group_commit_queue_size_bytes);
 
 DEFINE_bool(skip_remove_old_recovery_dir, false,
@@ -552,6 +553,16 @@ Status TabletBootstrap::RunBootstrap(shared_ptr<Tablet>* rebuilt_tablet,
     VLOG_WITH_PREFIX(1) << "Tablet Metadata: " << SecureDebugString(super_block);
   }
 
+
+  // Ensure the tablet's data dirs are present and healthy before it is opened.
+  DataDirGroupPB data_dir_group;
+  RETURN_NOT_OK_PREPEND(
+      tablet_meta_->fs_manager()->dd_manager()->GetDataDirGroupPB(tablet_id, &data_dir_group),
+      "error retrieving tablet data dir group (one or more data dirs may have been removed)");
+  if (tablet_meta_->fs_manager()->dd_manager()->IsTabletInFailedDir(tablet_id)) {
+    return Status::IOError("some tablet data is in a failed directory");
+  }
+
   RETURN_NOT_OK(flushed_stores_.InitFrom(*tablet_meta_.get()));
 
   bool has_blocks;

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 92eb8f8..641c925 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -217,7 +217,7 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
 
   // Keep a copy of the old data dir group in case of flush failure.
   DataDirGroupPB pb;
-  bool old_group_exists = fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &pb);
+  bool old_group_exists = fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &pb).ok();
 
   // Remove the tablet's data dir group tracked by the DataDirManager.
   fs_manager_->dd_manager()->DeleteDataDirGroup(tablet_id_);
@@ -420,8 +420,11 @@ Status TabletMetadata::LoadFromSuperBlock(const TabletSuperBlockPB& superblock)
     fs_manager()->block_manager()->NotifyBlockId(max_block_id);
 
     if (superblock.has_data_dir_group()) {
-      RETURN_NOT_OK_PREPEND(fs_manager_->dd_manager()->LoadDataDirGroupFromPB(
-          tablet_id_, superblock.data_dir_group()), "Failed to load DataDirGroup from superblock");
+      // An error loading the data dir group is non-fatal, it just means the
+      // tablet will fail to bootstrap later.
+      WARN_NOT_OK(fs_manager_->dd_manager()->LoadDataDirGroupFromPB(
+          tablet_id_, superblock.data_dir_group()),
+          "failed to load DataDirGroup from superblock");
     } else if (tablet_data_state_ == TABLET_DATA_READY) {
       // If the superblock does not contain a DataDirGroup, this server has
       // likely been upgraded from before 1.5.0. Create a new DataDirGroup for
@@ -675,7 +678,7 @@ Status TabletMetadata::ToSuperBlockUnlocked(TabletSuperBlockPB* super_block,
   // Serialize the tablet's DataDirGroupPB if one exists. One may not exist if
   // this is called during a tablet deletion.
   DataDirGroupPB group_pb;
-  if (fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &group_pb)) {
+  if (fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &group_pb).ok()) {
     pb.mutable_data_dir_group()->Swap(&group_pb);
   }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 4e16e7d..ef88b0e 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -350,8 +350,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                             superblock_->tombstone_last_logged_opid(),
                                             &meta_));
   }
-  CHECK(fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_,
-                                                     superblock_->mutable_data_dir_group()));
+  CHECK_OK(fs_manager_->dd_manager()->GetDataDirGroupPB(
+      tablet_id_, superblock_->mutable_data_dir_group()));
 
   // Create the ConsensusMetadata before returning from Start() so that it's
   // possible to vote while we are copying the replica for the first time.

http://git-wip-us.apache.org/repos/asf/kudu/blob/b81d5569/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index a3c2887..0c8d8c4 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -941,14 +941,6 @@ void TSTabletManager::OpenTablet(const scoped_refptr<TabletReplica>& replica,
     return;
   }
 
-  // If the tablet is in a failed directory, don't bother bootstrapping.
-  if (fs_manager_->dd_manager()->IsTabletInFailedDir(tablet_id)) {
-    LOG(ERROR) << LogPrefix(tablet_id) << "aborting tablet bootstrap: tablet "
-                                          "has data in a failed directory";
-    s = Status::IOError("Tablet data is in a failed directory");
-    return;
-  }
-
   consensus::ConsensusBootstrapInfo bootstrap_info;
   LOG_TIMING_PREFIX(INFO, LogPrefix(tablet_id), "bootstrapping tablet") {
     // Disable tracing for the bootstrap, since this would result in


Mime
View raw message