impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From k...@apache.org
Subject [1/2] impala git commit: IMPALA-6285: Don't print stack trace on RPC errors.
Date Fri, 08 Dec 2017 08:06:51 GMT
Repository: impala
Updated Branches:
  refs/heads/master f3fa3e017 -> 542dc227c


IMPALA-6285: Don't print stack trace on RPC errors.

There is not much benefit in printing the stack trace when
Thrift RPC hits an error. As long as we print enough info about
the error and identify the caller, that should be sufficient.
In fact, it has been observed that stack crawl caused unnecessary
CPU spikes in the past. This change replaces Status() with
Status::Expected() in DoRpc(), RetryRpc(), RetryRpcRecv() and
Coordinator::BackendState::Exec() to avoid unnecessary stack crawls.

Testing done: private core build. Verified error strings with
test_rpc_timeout.py and test_rpc_exception.py

Change-Id: Ia83294494442ef21f7934f92ba9112e80d81fa58
Reviewed-on: http://gerrit.cloudera.org:8080/8788
Reviewed-by: Michael Ho <kwho@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d60eb192
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d60eb192
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d60eb192

Branch: refs/heads/master
Commit: d60eb192a959afd5e1a7062b360ade2ef8a8f4f4
Parents: f3fa3e0
Author: Michael Ho <kwho@cloudera.com>
Authored: Thu Dec 7 00:21:19 2017 -0800
Committer: Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org>
Committed: Thu Dec 7 23:58:40 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/client-cache.h               | 43 ++++++++++++++++--------
 be/src/runtime/coordinator-backend-state.cc |  2 +-
 common/thrift/generate_error_codes.py       |  2 +-
 3 files changed, 31 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d60eb192/be/src/runtime/client-cache.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/client-cache.h b/be/src/runtime/client-cache.h
index d0fd30e..500ad59 100644
--- a/be/src/runtime/client-cache.h
+++ b/be/src/runtime/client-cache.h
@@ -240,8 +240,7 @@ class ClientConnection {
       (client_->*f)(*response, request, &send_done);
     } catch (const apache::thrift::transport::TTransportException& e) {
       if (send_done && IsRecvTimeoutTException(e)) {
-        return Status(TErrorCode::RPC_RECV_TIMEOUT, strings::Substitute(
-            "Client $0 timed-out during recv call.", TNetworkAddressToString(address_)));
+        return RecvTimeoutStatus(typeid(*response).name());
       }
 
       // Client may have unexpectedly been closed, so re-open and retry once if we didn't
@@ -254,9 +253,9 @@ class ClientConnection {
       }
 
       // Payload was sent and failure wasn't a timeout waiting for response. Fail the RPC.
-      return Status(TErrorCode::RPC_GENERAL_ERROR, ExceptionMsg(e, send_done));
+      return GeneralErrorStatus(e, typeid(*response).name(), send_done);
     } catch (const apache::thrift::TException& e) {
-      return Status(TErrorCode::RPC_GENERAL_ERROR, ExceptionMsg(e, send_done));
+      return GeneralErrorStatus(e, typeid(*response).name(), send_done);
     }
     client_is_unrecoverable_ = false;
     return Status::OK();
@@ -273,13 +272,12 @@ class ClientConnection {
       (client_->*recv_func)(*response);
     } catch (const apache::thrift::transport::TTransportException& e) {
       if (IsRecvTimeoutTException(e)) {
-        return Status(TErrorCode::RPC_RECV_TIMEOUT, strings::Substitute(
-            "Client $0 timed-out during recv call.", TNetworkAddressToString(address_)));
+        return RecvTimeoutStatus(typeid(*response).name());
       }
       // If it's not timeout exception, then the connection is broken, stop retrying.
-      return Status(TErrorCode::RPC_GENERAL_ERROR, e.what());
+      return GeneralErrorStatus(e, typeid(*response).name(), true);
     } catch (const apache::thrift::TException& e) {
-      return Status(TErrorCode::RPC_GENERAL_ERROR, e.what());
+      return GeneralErrorStatus(e, typeid(*response).name(), true);
     }
     client_is_unrecoverable_ = false;
     return Status::OK();
@@ -295,10 +293,25 @@ class ClientConnection {
   /// recovered.
   bool client_is_unrecoverable_;
 
-  std::string ExceptionMsg(const apache::thrift::TException& e, bool send_done) {
-    return strings::Substitute("Client for $0 hits an unexpected exception: $1, type: $2"
-        " rpc send completed: $3", TNetworkAddressToString(address_), e.what(),
-        typeid(e).name(), send_done ? "true" : "false");
+  /// Returns an error Status for general RPC errors not due to recv timeout or
+  /// stale connections. Will also log some details about the error.
+  Status GeneralErrorStatus(
+      const apache::thrift::TException& e, const std::string& rpc_name, bool send_done)
{
+    ErrorMsg msg(TErrorCode::RPC_GENERAL_ERROR, strings::Substitute("Client for $0 hit "
+        "an unexpected exception: $1, type: $2, rpc: $3, send: $4",
+        TNetworkAddressToString(address_), e.what(), typeid(e).name(), rpc_name,
+        send_done ? "done" : "not done"));
+    VLOG_QUERY << msg.msg();
+    return Status::Expected(msg);
+  }
+
+  /// Returns an error Status for RPC errors due to recv timeout.
+  /// Will also log some details about the error.
+  Status RecvTimeoutStatus(const std::string& rpc_name) {
+    ErrorMsg msg(TErrorCode::RPC_RECV_TIMEOUT, TNetworkAddressToString(address_),
+        rpc_name);
+    VLOG_QUERY << msg.msg();
+    return Status::Expected(msg);
   }
 
   /// Retry the RPC if TCP connection underpinning this client has been closed
@@ -312,7 +325,9 @@ class ClientConnection {
     // TODO: ThriftClient should return proper error codes.
     Status status = Reopen();
     if (!status.ok()) {
-      return Status(TErrorCode::RPC_CLIENT_CONNECT_FAILURE, status.GetDetail());
+      ErrorMsg msg(TErrorCode::RPC_CLIENT_CONNECT_FAILURE, status.GetDetail());
+      VLOG_QUERY << msg.msg() << " rpc: " << typeid(*response).name();
+      return Status::Expected(msg);
     }
     bool send_done = false;
     try {
@@ -321,7 +336,7 @@ class ClientConnection {
       // By this point the RPC really has failed.
       // TODO: Revisit this logic later. It's possible that the new connection
       // works but we hit timeout here.
-      return Status(TErrorCode::RPC_GENERAL_ERROR, ExceptionMsg(e, send_done));
+      return GeneralErrorStatus(e, typeid(*response).name(), send_done);
     }
     client_is_unrecoverable_ = false;
     return Status::OK();

http://git-wip-us.apache.org/repos/asf/impala/blob/d60eb192/be/src/runtime/coordinator-backend-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc
index 973aa25..9d89086 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -183,7 +183,7 @@ void Coordinator::BackendState::Exec(
     const string& err_msg =
         Substitute(ERR_TEMPLATE, PrintId(query_id_), rpc_status.msg().msg());
     VLOG_QUERY << err_msg;
-    status_ = Status(err_msg);
+    status_ = Status::Expected(err_msg);
     return;
   }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/d60eb192/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 40739e0..5aec1ce 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -114,7 +114,7 @@ error_codes = (
    "Verify that all your impalads are the same version."),
 
   ("RPC_GENERAL_ERROR", 30, "RPC Error: $0"),
-  ("RPC_RECV_TIMEOUT", 31, "RPC recv timed out: $0"),
+  ("RPC_RECV_TIMEOUT", 31, "RPC recv timed out: dest address: $0, rpc: $1"),
 
   ("UDF_VERIFY_FAILED", 32,
    "Failed to verify function $0 from LLVM module $1, see log for more details."),


Mime
View raw message