kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/2] kudu git commit: KUDU-2414. Scanner should not retry a SCANNER_EXPIRED error without reopening
Date Tue, 24 Apr 2018 01:05:24 GMT
KUDU-2414. Scanner should not retry a SCANNER_EXPIRED error without reopening

This fixes yet another tight retry loop seen while testing Impala. A scanner
expired on the server side, and the client got the expiration error. However,
the current code sees that as retryable, but should know that it's the kind
of retry that must re-open the scanner.

Without the fix, the new unit test fails with:

../../src/kudu/client/client-test.cc:1504: Failure
      Expected: "Not found: Scanner not found"
To be equal to: s.ToString()
      Which is: "Timed out: Scan RPC to 127.110.140.1:33568 timed out after 0.000s (SENT):
Not found: Scanner not found"
../../src/kudu/client/client-test.cc:1508: Failure
      Expected: 1
To be equal to: scan_rpcs->TotalCount()
      Which is: 104127

Change-Id: I5e09849d5b749db958c3a3df89fd8de9947ab991
Reviewed-on: http://gerrit.cloudera.org:8080/2711
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <adar@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/281ed652
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/281ed652
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/281ed652

Branch: refs/heads/master
Commit: 281ed6525261a88e0d6806af63c5a12b07e6418d
Parents: 9f70f76
Author: Todd Lipcon <todd@apache.org>
Authored: Mon Apr 23 16:09:10 2018 -0700
Committer: Todd Lipcon <todd@apache.org>
Committed: Tue Apr 24 00:55:43 2018 +0000

----------------------------------------------------------------------
 src/kudu/client/client-test.cc      | 45 +++++++++++++++++++++++++++++++-
 src/kudu/client/client.cc           |  5 ++--
 src/kudu/client/scanner-internal.cc | 14 ++++++++--
 src/kudu/client/scanner-internal.h  | 12 ++++++---
 4 files changed, 67 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/281ed652/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index de34183..816fb39 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -138,9 +138,9 @@ METRIC_DECLARE_counter(rpcs_queue_overflow);
 METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetMasterRegistration);
 METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetTableLocations);
 METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetTabletLocations);
+METRIC_DECLARE_histogram(handler_latency_kudu_tserver_TabletServerService_Scan);
 
 using std::bind;
-using std::for_each;
 using std::function;
 using std::map;
 using std::pair;
@@ -1471,6 +1471,49 @@ TEST_F(ClientTest, TestScanFaultTolerance) {
   }
 }
 
+// For a non-fault-tolerant scan, if we see a SCANNER_EXPIRED error, we should
+// immediately fail, rather than retrying over and over on the same server.
+// Regression test for KUDU-2414.
+TEST_F(ClientTest, TestNonFaultTolerantScannerExpired) {
+  // Create test table and insert test rows.
+  const string kScanTable = "TestNonFaultTolerantScannerExpired";
+  shared_ptr<KuduTable> table;
+
+  const int kNumReplicas = 1;
+  ASSERT_NO_FATAL_FAILURE(CreateTable(kScanTable, kNumReplicas, {}, {}, &table));
+  ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), FLAGS_test_scan_num_rows));
+
+  KuduScanner scanner(table.get());
+  ASSERT_OK(scanner.SetTimeoutMillis(30 * 1000));
+  // Set a small batch size so it reads in multiple batches.
+  ASSERT_OK(scanner.SetBatchSizeBytes(1));
+
+  // First batch should be fine.
+  ASSERT_OK(scanner.Open());
+  KuduScanBatch batch;
+  Status s = scanner.NextBatch(&batch);
+  ASSERT_OK(s);
+
+  // Restart the server hosting the scan, so that on an attempt to continue
+  // the scan, it will get an error.
+  {
+    KuduTabletServer* kts_ptr;
+    ASSERT_OK(scanner.GetCurrentServer(&kts_ptr));
+    unique_ptr<KuduTabletServer> kts(kts_ptr);
+    ASSERT_OK(this->RestartTServerAndWait(kts->uuid()));
+  }
+
+  // We should now get the appropriate error.
+  s = scanner.NextBatch(&batch);
+  EXPECT_EQ("Not found: Scanner not found", s.ToString());
+
+  // It should not have performed any retries. Since we restarted the server above,
+  // we should see only one Scan RPC: the latest (failed) attempt above.
+  const auto& scan_rpcs = METRIC_handler_latency_kudu_tserver_TabletServerService_Scan.Instantiate(
+      cluster_->mini_tablet_server(0)->server()->metric_entity());
+  ASSERT_EQ(1, scan_rpcs->TotalCount());
+}
+
 TEST_F(ClientTest, TestNonCoveringRangePartitions) {
   // Create test table and insert test rows.
   const string kTableName = "TestNonCoveringRangePartitions";

http://git-wip-us.apache.org/repos/asf/kudu/blob/281ed652/src/kudu/client/client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc
index 26512dc..b58d127 100644
--- a/src/kudu/client/client.cc
+++ b/src/kudu/client/client.cc
@@ -1521,7 +1521,8 @@ Status KuduScanner::NextBatch(KuduScanBatch* batch) {
 
       // Error handling.
       set<string> blacklist;
-      Status s = data_->HandleError(result, batch_deadline, &blacklist);
+      bool needs_reopen = false;
+      Status s = data_->HandleError(result, batch_deadline, &blacklist, &needs_reopen);
       if (!s.ok()) {
         LOG(WARNING) << "Scan at tablet server " << data_->ts_->ToString()
<< " of tablet "
                      << data_->DebugString() << " failed: " << result.status.ToString();
@@ -1534,7 +1535,7 @@ Status KuduScanner::NextBatch(KuduScanBatch* batch) {
         return data_->ReopenCurrentTablet(batch_deadline, &blacklist);
       }
 
-      if (blacklist.empty()) {
+      if (blacklist.empty() && !needs_reopen) {
         // If we didn't blacklist the current server, we can just retry again.
         continue;
       }

http://git-wip-us.apache.org/repos/asf/kudu/blob/281ed652/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index cbbc605..af8f783 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -82,7 +82,12 @@ KuduScanner::Data::~Data() {
 
 Status KuduScanner::Data::HandleError(const ScanRpcStatus& err,
                                       const MonoTime& deadline,
-                                      set<string>* blacklist) {
+                                      set<string>* blacklist,
+                                      bool* needs_reopen) {
+  if (needs_reopen != nullptr) {
+    *needs_reopen = false;
+  }
+
   // If we timed out because of the overall deadline, we're done.
   // We didn't wait a full RPC timeout, though, so don't mark the tserver as failed.
   if (err.result == ScanRpcStatus::OVERALL_DEADLINE_EXCEEDED) {
@@ -110,6 +115,11 @@ Status KuduScanner::Data::HandleError(const ScanRpcStatus& err,
       mark_ts_failed = true;
       break;
     case ScanRpcStatus::SCANNER_EXPIRED:
+      // It's safe to retry on the same server, but the scanner needs to be
+      // re-opened.
+      if (needs_reopen != nullptr) {
+        *needs_reopen = true;
+      }
       break;
     case ScanRpcStatus::RPC_INVALID_AUTHENTICATION_TOKEN:
       // Usually this happens if doing an RPC call with an expired auth token.
@@ -466,7 +476,7 @@ Status KuduScanner::Data::OpenTablet(const string& partition_key,
       break;
     }
     scan_attempts_++;
-    RETURN_NOT_OK(HandleError(scan_status, deadline, blacklist));
+    RETURN_NOT_OK(HandleError(scan_status, deadline, blacklist, /* needs_reopen=*/ nullptr));
   }
 
   partition_pruner_.RemovePartitionKeyRange(remote_->partition().partition_key_end());

http://git-wip-us.apache.org/repos/asf/kudu/blob/281ed652/src/kudu/client/scanner-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.h b/src/kudu/client/scanner-internal.h
index 145c6f2..9bfdabe 100644
--- a/src/kudu/client/scanner-internal.h
+++ b/src/kudu/client/scanner-internal.h
@@ -129,14 +129,18 @@ class KuduScanner::Data {
   // Called when KuduScanner::NextBatch or KuduScanner::Data::OpenTablet result in an RPC
or
   // server error.
   //
-  // If the provided 'status' indicates the error was retryable, then returns Status::OK()
+  // If the provided 'err' indicates the error was retryable, then returns Status::OK()
   // and potentially inserts the current server into 'blacklist' if the retry should be
-  // made on a different replica.
+  // made on a different replica. If the current server seems healthy, but the scanner expired,
+  // sets 'needs_reopen' to true to indicate that the client should re-open a new scanner.
+  //
+  // If 'needs_reopen' is nullptr, then it is not set.
   //
   // This function may also sleep in case the error suggests that backoff is necessary.
-  Status HandleError(const ScanRpcStatus& status,
+  Status HandleError(const ScanRpcStatus& err,
                      const MonoTime& deadline,
-                     std::set<std::string>* blacklist);
+                     std::set<std::string>* blacklist,
+                     bool* needs_reopen);
 
   // Opens the next tablet in the scan, or returns Status::NotFound if there are
   // no more tablets to scan.


Mime
View raw message