kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mpe...@apache.org
Subject [kudu] 02/03: tablet_service: improve error handling specificity
Date Tue, 26 Mar 2019 22:59:16 GMT
This is an automated email from the ASF dual-hosted git repository.

mpercy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c355e8ed77b823783d670ba7c090b12355d6340e
Author: Mike Percy <mpercy@apache.org>
AuthorDate: Fri Mar 22 17:30:03 2019 -0700

    tablet_service: improve error handling specificity
    
    The error handling in TabletServiceImpl::HandleNewScanRequest() was
    convoluted. It's still not particularly inspired, but this patch makes
    several clarity and maintainability improvements:
    
    * Instead of making it the default error returned from
      HandleNewScanRequest(), localize the return of
      TabletServerErrorPB::MISMATCHED_SCHEMA to where Iterator::Init()
      returns an InvalidArgument, since that comes from a
      GetMappedReadProjection() failure.
    * Use an INVALID_SCAN_SPEC error for the case where the user specifies
      an unrecognized order mode, which is consistent with when the user
      specifies an unknown read mode or when the user specifies order mode
      ORDERED without specifying a snapshot scan. Also, update the comments
      in the .proto file to reflect the current usage.
    * Localize assignment of TabletServerErrorPB::Code as much as possible
      in HandleNewScanRequest() by plumbing an `error_code` pointer into
      HandleScanAtSnapshot() so that errors in that method can be specific
      to the type of problem observed.
    
    Change-Id: I1e6f7b01336e78ea8f37cb17b2c4485138e23581
    Reviewed-on: http://gerrit.cloudera.org:8080/12840
    Tested-by: Kudu Jenkins
    Reviewed-by: Adar Dembo <adar@cloudera.com>
---
 src/kudu/tools/ksck_remote-test.cc          |  4 +-
 src/kudu/tserver/tablet_server-test-base.cc | 27 +++++-----
 src/kudu/tserver/tablet_server-test-base.h  |  9 +++-
 src/kudu/tserver/tablet_server-test.cc      | 13 +++++
 src/kudu/tserver/tablet_service.cc          | 79 ++++++++++++++---------------
 src/kudu/tserver/tablet_service.h           |  3 +-
 src/kudu/tserver/tserver.proto              |  5 +-
 7 files changed, 81 insertions(+), 59 deletions(-)

diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index b02c0e8..55c1ca7 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -475,8 +475,8 @@ TEST_F(RemoteKsckTest, TestChecksumSnapshotLastingLongerThanAHM) {
   ASSERT_TRUE(s.IsAborted()) << s.ToString();
   ASSERT_OK(ksck_->PrintResults());
   ASSERT_STR_CONTAINS(err_stream_.str(),
-                      "Invalid argument: Snapshot timestamp is earlier than "
-                      "the ancient history mark.");
+                      "Invalid argument: cannot verify timestamp: "
+                      "Snapshot timestamp is earlier than the ancient history mark.");
 
   // Now let's try again using the special current timestamp, which will run
   // checksums using timestamps updated from the servers, and should succeed.
diff --git a/src/kudu/tserver/tablet_server-test-base.cc b/src/kudu/tserver/tablet_server-test-base.cc
index f9bf79d..524a11a 100644
--- a/src/kudu/tserver/tablet_server-test-base.cc
+++ b/src/kudu/tserver/tablet_server-test-base.cc
@@ -433,21 +433,13 @@ void TabletServerTestBase::VerifyRows(const Schema& schema,
   ASSERT_EQ(count, expected.size());
 }
 
-// Verifies that a simple scan request fails with the specified error code/message.
-void TabletServerTestBase::VerifyScanRequestFailure(
-    const Schema& projection,
-    TabletServerErrorPB::Code expected_code,
-    const char *expected_message) {
-  ScanRequestPB req;
+void TabletServerTestBase::VerifyScanRequestFailure(const ScanRequestPB& req,
+                                                    TabletServerErrorPB::Code expected_code,
+                                                    const char *expected_message) {
   ScanResponsePB resp;
   RpcController rpc;
 
-  NewScanRequestPB* scan = req.mutable_new_scan_request();
-  scan->set_tablet_id(kTabletId);
-  ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns()));
-  req.set_call_seq_id(0);
-
-  // Send the call
+  // Send the call.
   {
     SCOPED_TRACE(SecureDebugString(req));
     ASSERT_OK(proxy_->Scan(req, &resp, &rpc));
@@ -458,6 +450,17 @@ void TabletServerTestBase::VerifyScanRequestFailure(
   }
 }
 
+void TabletServerTestBase::VerifyScanRequestFailure(const Schema& projection,
+                                                    TabletServerErrorPB::Code expected_code,
+                                                    const char *expected_message) {
+  ScanRequestPB req;
+  NewScanRequestPB* scan = req.mutable_new_scan_request();
+  scan->set_tablet_id(kTabletId);
+  ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns()));
+  req.set_call_seq_id(0);
+  NO_FATALS(VerifyScanRequestFailure(req, expected_code, expected_message));
+}
+
 Status TabletServerTestBase::FillNewScanRequest(ReadMode read_mode, NewScanRequestPB* scan)
const {
   scan->set_tablet_id(kTabletId);
   scan->set_read_mode(read_mode);
diff --git a/src/kudu/tserver/tablet_server-test-base.h b/src/kudu/tserver/tablet_server-test-base.h
index 2f6af1d..b10e3b5 100644
--- a/src/kudu/tserver/tablet_server-test-base.h
+++ b/src/kudu/tserver/tablet_server-test-base.h
@@ -111,7 +111,14 @@ class TabletServerTestBase : public KuduTest {
   // Verifies that a set of expected rows (key, value) is present in the tablet.
   void VerifyRows(const Schema& schema, const std::vector<KeyValue>& expected);
 
-  // Verifies that a simple scan request fails with the specified error code/message.
+  // Verifies that the given scan request fails with the specified error
+  // code/message.
+  void VerifyScanRequestFailure(const ScanRequestPB& req,
+                                TabletServerErrorPB::Code expected_code,
+                                const char *expected_message);
+
+  // Verifies that a simple scan request with the given projection fails with
+  // the specified error code/message.
   void VerifyScanRequestFailure(const Schema& projection,
                                 TabletServerErrorPB::Code expected_code,
                                 const char *expected_message);
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index b720929..a4604a3 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -2471,6 +2471,19 @@ TEST_F(TabletServerTest, TestInvalidScanRequest_BadProjectionTypes)
{
                            "NULLABLE found INT32 NULLABLE");
 }
 
+TEST_F(TabletServerTest, TestInvalidScanRequest_UnknownOrderMode) {
+  NO_FATALS(InsertTestRowsDirect(0, 10));
+  ScanRequestPB req;
+  NewScanRequestPB* scan = req.mutable_new_scan_request();
+  scan->set_tablet_id(kTabletId);
+  scan->set_order_mode(OrderMode::UNKNOWN_ORDER_MODE);
+  ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns()));
+  req.set_call_seq_id(0);
+  NO_FATALS(VerifyScanRequestFailure(req,
+                                     TabletServerErrorPB::INVALID_SCAN_SPEC,
+                                     "Unknown order mode specified"));
+}
+
 // Test that passing a projection with Column IDs throws an exception.
 // Column IDs are assigned to the user request schema on the tablet server
 // based on the latest schema.
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index 916c4c5..e710e37 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -2220,11 +2220,16 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
     return Status::InvalidArgument("User requests should not have Column IDs");
   }
 
+  if (scan_pb.order_mode() == UNKNOWN_ORDER_MODE) {
+    *error_code = TabletServerErrorPB::INVALID_SCAN_SPEC;
+    return Status::InvalidArgument("Unknown order mode specified");
+  }
+
   if (scan_pb.order_mode() == ORDERED) {
     // Ordered scans must be at a snapshot so that we perform a serializable read (which
can be
     // resumed). Otherwise, this would be read committed isolation, which is not resumable.
     if (scan_pb.read_mode() != READ_AT_SNAPSHOT) {
-      *error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
+      *error_code = TabletServerErrorPB::INVALID_SCAN_SPEC;
           return Status::InvalidArgument("Cannot do an ordered scan that is not a snapshot
read");
     }
   }
@@ -2268,8 +2273,6 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
   projection = projection_builder.BuildWithoutIds();
 
   unique_ptr<RowwiseIterator> iter;
-  // Preset the error code for when creating the iterator on the tablet fails
-  TabletServerErrorPB::Code tmp_error_code = TabletServerErrorPB::MISMATCHED_SCHEMA;
 
   // It's important to keep the reference to the tablet for the case when the
   // tablet replica's shutdown is run concurrently with the code below.
@@ -2293,8 +2296,7 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
     switch (scan_pb.read_mode()) {
       case UNKNOWN_READ_MODE: {
         *error_code = TabletServerErrorPB::INVALID_SCAN_SPEC;
-        s = Status::NotSupported("Unknown read mode.");
-        return s;
+        return Status::NotSupported("Unknown read mode.");
       }
       case READ_LATEST: {
         s = tablet->NewRowIterator(projection, &iter);
@@ -2304,16 +2306,8 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
       case READ_AT_SNAPSHOT: {
         scoped_refptr<consensus::TimeManager> time_manager = replica->time_manager();
         s = HandleScanAtSnapshot(scan_pb, rpc_context, projection, tablet.get(),
-                                 time_manager.get(), &iter, snap_timestamp);
-        // If we got a Status::ServiceUnavailable() from HandleScanAtSnapshot() it might
-        // mean we're just behind so let the client try again.
-        if (s.IsServiceUnavailable()) {
-          *error_code = TabletServerErrorPB::THROTTLED;
-          return s;
-        }
-        if (!s.ok()) {
-          tmp_error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
-        }
+                                 time_manager.get(), &iter, snap_timestamp,
+                                 error_code);
         break;
       }
     }
@@ -2329,25 +2323,27 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
   if (PREDICT_TRUE(s.ok())) {
     TRACE_EVENT0("tserver", "iter->Init");
     s = iter->Init(spec.get());
+    if (PREDICT_FALSE(s.IsInvalidArgument())) {
+      // Tablet::Iterator::Init() returns InvalidArgument when an invalid
+      // projection is specified.
+      // TODO(todd): would be nice if we threaded these more specific
+      // error codes throughout Kudu.
+      *error_code = TabletServerErrorPB::MISMATCHED_SCHEMA;
+      return s;
+    }
   }
 
   TRACE("Iterator init: $0", s.ToString());
 
-  if (PREDICT_FALSE(s.IsInvalidArgument())) {
-    // An invalid projection returns InvalidArgument above.
-    // TODO: would be nice if we threaded these more specific
-    // error codes throughout Kudu.
-    *error_code = tmp_error_code;
-    return s;
-  }
   if (PREDICT_FALSE(!s.ok())) {
     LOG(WARNING) << Substitute("Error setting up scanner with request $0: $1",
                                SecureShortDebugString(*req), s.ToString());
     // If the replica has been stopped, e.g. due to disk failure, return
     // TABLET_FAILED so the scan can be handled appropriately (fail over to
     // another tablet server if fault-tolerant).
-    *error_code = tablet->HasBeenStopped() ?
-        TabletServerErrorPB::TABLET_FAILED : TabletServerErrorPB::UNKNOWN_ERROR;
+    if (tablet->HasBeenStopped()) {
+      *error_code = TabletServerErrorPB::TABLET_FAILED;
+    }
     return s;
   }
 
@@ -2373,11 +2369,8 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
   // error if the snapshot timestamp was prior to the ancient history mark.
   // We have to check after we open the iterator in order to avoid a TOCTOU
   // error.
-  s = VerifyNotAncientHistory(tablet.get(), scan_pb.read_mode(), *snap_timestamp);
-  if (!s.ok()) {
-    *error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
-    return s;
-  }
+  RETURN_NOT_OK_EVAL(VerifyNotAncientHistory(tablet.get(), scan_pb.read_mode(), *snap_timestamp),
+                     *error_code = TabletServerErrorPB::INVALID_SNAPSHOT);
 
   *has_more_results = iter->HasNext() && !scanner->has_fulfilled_limit();
   TRACE("has_more: $0", *has_more_results);
@@ -2607,7 +2600,8 @@ Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB&
scan_pb,
                                                Tablet* tablet,
                                                consensus::TimeManager* time_manager,
                                                unique_ptr<RowwiseIterator>* iter,
-                                               Timestamp* snap_timestamp) {
+                                               Timestamp* snap_timestamp,
+                                               TabletServerErrorPB::Code* error_code) {
   switch (scan_pb.read_mode()) {
     case READ_AT_SNAPSHOT: // Fallthrough intended
     case READ_YOUR_WRITES:
@@ -2618,7 +2612,11 @@ Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB&
scan_pb,
 
   // Based on the read mode, pick a timestamp and verify it.
   Timestamp tmp_snap_timestamp;
-  RETURN_NOT_OK(PickAndVerifyTimestamp(scan_pb, tablet, &tmp_snap_timestamp));
+  Status s = PickAndVerifyTimestamp(scan_pb, tablet, &tmp_snap_timestamp);
+  if (PREDICT_FALSE(!s.ok())) {
+    *error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
+    return s.CloneAndPrepend("cannot verify timestamp");
+  }
 
   // Reduce the client's deadline by a few msecs to allow for overhead.
   MonoTime client_deadline = rpc_context->GetClientDeadline() - MonoDelta::FromMilliseconds(10);
@@ -2627,8 +2625,9 @@ Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB&
scan_pb,
   // server will have one less available thread and the client might be stuck spending all
   // of the allotted time for the scan on a partitioned server that will never have a consistent
   // snapshot at 'snap_timestamp'.
-  // Because of this we clamp the client's deadline to the max. configured. If the client
-  // sets a long timeout then it can use it by trying in other servers.
+  // Because of this we clamp the client's deadline to the maximum configured
+  // scanner wait time. If the client sets a longer timeout then it can use it
+  // by retrying (possibly on other servers).
   bool was_clamped = false;
   MonoTime final_deadline = ClampScanDeadlineForWait(client_deadline, &was_clamped);
 
@@ -2638,19 +2637,20 @@ Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB&
scan_pb,
   // the same timestamp (repeatable reads).
   TRACE("Waiting safe time to advance");
   MonoTime before = MonoTime::Now();
-  Status s = time_manager->WaitUntilSafe(tmp_snap_timestamp, final_deadline);
+  s = time_manager->WaitUntilSafe(tmp_snap_timestamp, final_deadline);
 
   tablet::MvccSnapshot snap;
-  tablet::MvccManager* mvcc_manager = tablet->mvcc_manager();
-  if (s.ok()) {
+  if (PREDICT_TRUE(s.ok())) {
     // Wait for the in-flights in the snapshot to be finished.
     TRACE("Waiting for operations to commit");
-    s = mvcc_manager->WaitForSnapshotWithAllCommitted(tmp_snap_timestamp, &snap, client_deadline);
+    s = tablet->mvcc_manager()->WaitForSnapshotWithAllCommitted(
+          tmp_snap_timestamp, &snap, client_deadline);
   }
 
   // If we got an TimeOut but we had clamped the deadline, return a ServiceUnavailable instead
   // so that the client retries.
-  if (s.IsTimedOut() && was_clamped) {
+  if (PREDICT_FALSE(s.IsTimedOut() && was_clamped)) {
+    *error_code = TabletServerErrorPB::THROTTLED;
     return Status::ServiceUnavailable(s.CloneAndPrepend(
         "could not wait for desired snapshot timestamp to be consistent").ToString());
   }
@@ -2660,9 +2660,6 @@ Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB&
scan_pb,
   tablet->metrics()->snapshot_read_inflight_wait_duration->Increment(duration_usec);
   TRACE("All operations in snapshot committed. Waited for $0 microseconds", duration_usec);
 
-  if (scan_pb.order_mode() == UNKNOWN_ORDER_MODE) {
-    return Status::InvalidArgument("Unknown order mode specified");
-  }
   tablet::RowIteratorOptions opts;
   opts.projection = &projection;
   opts.snap_to_include = snap;
diff --git a/src/kudu/tserver/tablet_service.h b/src/kudu/tserver/tablet_service.h
index bc4c2b0..c3e6d27 100644
--- a/src/kudu/tserver/tablet_service.h
+++ b/src/kudu/tserver/tablet_service.h
@@ -157,7 +157,8 @@ class TabletServiceImpl : public TabletServerServiceIf {
                               tablet::Tablet* tablet,
                               consensus::TimeManager* time_manager,
                               std::unique_ptr<RowwiseIterator>* iter,
-                              Timestamp* snap_timestamp);
+                              Timestamp* snap_timestamp,
+                              TabletServerErrorPB::Code* error_code);
 
   // Validates the given timestamp is not so far in the future that
   // it exceeds the maximum allowed clock synchronization error time,
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index 3964fdb..9d7b50e 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -56,8 +56,9 @@ message TabletServerErrorPB {
     // created or has expired.
     SCANNER_EXPIRED = 7;
 
-    // An invalid scan was specified -- e.g the values passed for
-    // predicates were incorrect sizes.
+    // An invalid scan was specified. Examples of invalid scan requests include
+    // invalid or unimplemented combinations of scan options and incorrect
+    // sizes for values passed as predicates.
     INVALID_SCAN_SPEC = 8;
 
     // The provided configuration was not well-formed and/or


Mime
View raw message