kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wdberke...@apache.org
Subject [1/2] kudu git commit: KUDU-2364 Add extra check in ksck for tserver ID
Date Thu, 29 Mar 2018 22:21:51 GMT
Repository: kudu
Updated Branches:
  refs/heads/master c83f6eb35 -> 61d3fff2f


KUDU-2364 Add extra check in ksck for tserver ID

ksck did not validate the tablet server's ID when checking connectivity.
If a tablet server was wiped and re-added at the same address (and so
came back with a new UUID), ksck would still report a successful
connection for the old tablet server's entry.

Now it reports an error like the one below:

./bin/kudu cluster ksck localhost
Connected to the Master
WARNING: Unable to connect to Tablet Server d92197fa2f034b33aa0bf998c41f637b
(va1022.halxg.cloudera.com:7073): Remote error: ID reported by tablet server
(50effcf1fe284ab693e7d1d43c5f18ad) doesn't match the expected ID:
d92197fa2f034b33aa0bf998c41f637b
Tablet Server Summary
               UUID               |          RPC Address           |      Status
----------------------------------+--------------------------------+-------------------
 22ec2c07d8aa4e1ba33a4eb42d4c3a21 | va1022.halxg.cloudera.com:7072 | HEALTHY
 50effcf1fe284ab693e7d1d43c5f18ad | va1022.halxg.cloudera.com:7073 | HEALTHY
 a05c93549cca4ceebc275651d8117065 | va1022.halxg.cloudera.com:7074 | HEALTHY
 d92197fa2f034b33aa0bf998c41f637b | va1022.halxg.cloudera.com:7073 | WRONG_SERVER_UUID
WARNING: Fetched info from 3 Tablet Servers, 1 weren't reachable
The cluster doesn't have any matching tables
==================
Errors:
==================
error fetching info from tablet servers: Could not gather complete information from all
tablet servers

FAILED
Runtime error: ksck discovered errors

Change-Id: Ia2c18ba7af8eaa6f5e4d7842f18754d2c1e32526
Reviewed-on: http://gerrit.cloudera.org:8080/9787
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Kudu Jenkins
Reviewed-by: Will Berkeley <wdberkeley@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/030a4d33
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/030a4d33
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/030a4d33

Branch: refs/heads/master
Commit: 030a4d330ead310adc2d66cde6af11cc334a175f
Parents: c83f6eb
Author: Attila Bukor <abukor@cloudera.com>
Authored: Wed Mar 28 21:42:59 2018 +0200
Committer: Will Berkeley <wdberkeley@gmail.com>
Committed: Thu Mar 29 08:28:38 2018 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck-test.cc        | 21 +++++++++++++
 src/kudu/tools/ksck.cc             | 56 +++++++++++++++++++++++++++++++++
 src/kudu/tools/ksck.h              | 20 ++++++++++++
 src/kudu/tools/ksck_remote-test.cc | 27 ++++++++++++++++
 src/kudu/tools/ksck_remote.cc      | 18 ++++++++---
 5 files changed, 138 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/030a4d33/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index a20febe..74157e7 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -324,6 +324,27 @@ TEST_F(KsckTest, TestTabletServersOk) {
   ASSERT_OK(RunKsck());
 }
 
+TEST_F(KsckTest, TestWrongUUIDTabletServer) {
+  CreateOneTableOneTablet();
+
+  Status error = Status::RemoteError("ID reported by tablet server "
+                                     "doesn't match the expected ID");
+  static_pointer_cast<MockKsckTabletServer>(master_->tablet_servers_["ts-id-1"])
+    ->fetch_info_status_ = error;
+
+  ASSERT_OK(ksck_->CheckMasterRunning());
+  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+  ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+    "Tablet Server Summary\n"
+    "  UUID   | RPC Address |      Status\n"
+    "---------+-------------+-------------------\n"
+    " ts-id-2 | <mock>      | HEALTHY\n"
+    " ts-id-0 | <mock>      | HEALTHY\n"
+    " ts-id-1 | <mock>      | WRONG_SERVER_UUID\n");
+}
+
+
 TEST_F(KsckTest, TestBadTabletServer) {
   CreateOneSmallReplicatedTable();
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/030a4d33/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index df62828..3a14f3e 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -26,6 +26,7 @@
 #include <mutex>
 #include <numeric>
 #include <type_traits>
+#include <vector>
 
 #include <boost/optional.hpp> // IWYU pragma: keep
 #include <gflags/gflags.h>
@@ -183,16 +184,44 @@ Status Ksck::FetchInfoFromTabletServers() {
 
   AtomicInt<int32_t> bad_servers(0);
   VLOG(1) << "Fetching info from all the Tablet Servers";
+
+  vector<TabletServerSummary> tablet_server_summaries;
+  simple_spinlock tablet_server_summaries_lock;
+
   for (const KsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) {
+
     CHECK_OK(pool->SubmitFunc([&]() {
           Status s = ConnectToTabletServer(entry.second);
+          TabletServerSummary summary;
           if (!s.ok()) {
             bad_servers.Increment();
+            if (s.IsRemoteError()) {
+              summary.health = TabletServerHealth::WRONG_SERVER_UUID;
+            } else {
+              summary.health = TabletServerHealth::UNAVAILABLE;
+            }
+          } else {
+            summary.health = TabletServerHealth::HEALTHY;
           }
+
+          summary.uuid = entry.second->uuid();
+          summary.host_port = entry.second->address();
+
+          std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock);
+          tablet_server_summaries.emplace_back(std::move(summary));
         }));
   }
+
   pool->Wait();
 
+  std::sort(tablet_server_summaries.begin(), tablet_server_summaries.end(),
+            [](const TabletServerSummary& left, const TabletServerSummary& right) {
+              return std::make_pair(left.health != TabletServerHealth::HEALTHY, left.host_port) <
+                     std::make_pair(right.health != TabletServerHealth::HEALTHY, right.host_port);
+            });
+
+  CHECK_OK(PrintTabletServerSummaries(tablet_server_summaries, Out()));
+
   if (bad_servers.Load() == 0) {
     Out() << Substitute("Fetched info from all $0 Tablet Servers", servers_count) << endl;
     return Status::OK();
@@ -221,6 +250,33 @@ Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
   return s;
 }
 
+Status Ksck::PrintTabletServerSummaries(const vector<TabletServerSummary>& tablet_server_summaries,
+                                        ostream& out) {
+  out << "Tablet Server Summary" << endl;
+  DataTable table({ "UUID", "RPC Address", "Status"});
+
+  for (const auto& ts : tablet_server_summaries) {
+    string status;
+    switch (ts.health) {
+      case TabletServerHealth::HEALTHY:
+        status = "HEALTHY";
+        break;
+      case TabletServerHealth::UNAVAILABLE:
+        status = "UNAVAILABLE";
+        break;
+      case TabletServerHealth::WRONG_SERVER_UUID:
+        status = "WRONG_SERVER_UUID";
+        break;
+      default:
+        LOG(FATAL) << "Unexpected health alert received";
+        break;
+    }
+    table.AddRow({ ts.uuid, ts.host_port, status });
+  }
+
+  return table.PrintTo(out);
+}
+
 Status Ksck::PrintTableSummaries(const vector<TableSummary>& table_summaries, ostream& out) {
   out << "Table Summary" << endl;
   DataTable table({ "Name", "Status", "Total Tablets",

http://git-wip-us.apache.org/repos/asf/kudu/blob/030a4d33/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index bf00d50..c55b758 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -477,6 +477,23 @@ class Ksck {
     CONSENSUS_MISMATCH,
   };
 
+  enum class TabletServerHealth {
+    // The tablet server is healthy
+    HEALTHY,
+
+    // The tablet server couldn't be connected to
+    UNAVAILABLE,
+
+    // The tablet server reported an unknown UUID
+    WRONG_SERVER_UUID,
+  };
+
+  struct TabletServerSummary {
+    std::string uuid;
+    std::string host_port;
+    TabletServerHealth health;
+  };
+
   // Summarizes the result of VerifyTable().
   struct TableSummary {
     std::string name;
@@ -510,6 +527,9 @@ class Ksck {
     }
   };
 
+  static Status PrintTabletServerSummaries(
+    const std::vector<TabletServerSummary>& tablet_server_summaries,
+    std::ostream& out);
   static Status PrintTableSummaries(const std::vector<TableSummary>& table_summaries,
                                     std::ostream& out);
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/030a4d33/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 2f64c38..370d0e2 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -34,14 +34,17 @@
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/mini_master.h"
 #include "kudu/mini-cluster/internal_mini_cluster.h"
 #include "kudu/tools/data_gen_util.h"
 #include "kudu/tools/ksck.h"
 #include "kudu/tools/ksck_remote.h"
+#include "kudu/tserver/mini_tablet_server.h"
 #include "kudu/util/atomic.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
 #include "kudu/util/promise.h"
 #include "kudu/util/random.h"
 #include "kudu/util/status.h"
@@ -222,6 +225,30 @@ TEST_F(RemoteKsckTest, TestTabletServersOk) {
   ASSERT_OK(ksck_->FetchInfoFromTabletServers());
 }
 
+TEST_F(RemoteKsckTest, TestTabletServerMismatchUUID) {
+  ASSERT_OK(ksck_->CheckMasterRunning());
+  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+
+  tserver::MiniTabletServer* tablet_server = mini_cluster_->mini_tablet_server(0);
+  string old_uuid = tablet_server->uuid();
+  string root_dir = mini_cluster_->GetTabletServerFsRoot(0) + "2";
+  HostPort address = HostPort(tablet_server->bound_rpc_addr());
+
+  tablet_server->Shutdown();
+  tserver::MiniTabletServer new_tablet_server(root_dir, address);
+  ASSERT_OK(new_tablet_server.Start());
+  ASSERT_OK(new_tablet_server.WaitStarted());
+
+  string new_uuid = new_tablet_server.uuid();
+
+  ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
+
+  string match_string = "Remote error: ID reported by tablet server "
+                        "($0) doesn't match the expected ID: $1";
+
+  ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, new_uuid, old_uuid));
+}
+
 TEST_F(RemoteKsckTest, TestTableConsistency) {
   MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(30);
   Status s;

http://git-wip-us.apache.org/repos/asf/kudu/blob/030a4d33/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 27929ef..d571809 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -33,6 +33,7 @@
 #include "kudu/common/common.pb.h"
 #include "kudu/common/schema.h"
 #include "kudu/common/wire_protocol.h"
+#include "kudu/common/wire_protocol.pb.h"
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus.proxy.h"
 #include "kudu/gutil/basictypes.h"
@@ -99,12 +100,18 @@ Status RemoteKsckTabletServer::FetchInfo() {
   state_ = kFetchFailed;
 
   {
-    tserver::PingRequestPB req;
-    tserver::PingResponsePB resp;
+    server::GetStatusRequestPB req;
+    server::GetStatusResponsePB resp;
     RpcController rpc;
     rpc.set_timeout(GetDefaultTimeout());
-    RETURN_NOT_OK_PREPEND(ts_proxy_->Ping(req, &resp, &rpc),
-                          "could not send Ping RPC to server");
+    RETURN_NOT_OK_PREPEND(generic_proxy_->GetStatus(req, &resp, &rpc),
+                          "could not get status from server");
+    string response_uuid = resp.status().node_instance().permanent_uuid();
+    if (response_uuid != uuid()) {
+      return Status::RemoteError(Substitute("ID reported by tablet server ($0) doesn't "
+                                 "match the expected ID: $1",
+                                 response_uuid, uuid()));
+    }
   }
 
   {
@@ -115,6 +122,9 @@ Status RemoteKsckTabletServer::FetchInfo() {
     req.set_need_schema_info(false);
     RETURN_NOT_OK_PREPEND(ts_proxy_->ListTablets(req, &resp, &rpc),
                           "could not list tablets");
+    if (resp.has_error()) {
+      return StatusFromPB(resp.error().status());
+    }
     tablet_status_map_.clear();
     for (auto& status : *resp.mutable_status_and_schema()) {
       tablet_status_map_[status.tablet_status().tablet_id()].Swap(status.mutable_tablet_status());


Mime
View raw message