From commits-return-5428-archive-asf-public=cust-asf.ponee.io@kudu.apache.org Thu Mar 15 18:23:44 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id F2FF2180654 for ; Thu, 15 Mar 2018 18:23:42 +0100 (CET) Received: (qmail 48929 invoked by uid 500); 15 Mar 2018 17:23:42 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 48920 invoked by uid 99); 15 Mar 2018 17:23:42 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 15 Mar 2018 17:23:42 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 78052F6632; Thu, 15 Mar 2018 17:23:41 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: todd@apache.org To: commits@kudu.apache.org Date: Thu, 15 Mar 2018 17:23:42 -0000 Message-Id: <0a879e8c057d4db999de363cf38150c1@git.apache.org> In-Reply-To: <65f89d753f0d4d3e8216477b2ad75ba2@git.apache.org> References: <65f89d753f0d4d3e8216477b2ad75ba2@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [2/2] kudu git commit: java: reduce scary exception spew and improve logging java: reduce scary exception spew and improve logging In real environments there are various types of failures that are "expected". In particular, it's not unlikely for the client to occasionally connect to a server which has gone down or which is not the leader for a given tablet anymore. In that case, we should make sure that the logging output is easy to read and doesn't include irrelevant stack traces stemming from Netty internals. This patch addresses several of those cases. In particular, it gets rid of "unexpected exception" messages (and associated stacks) in the case of "connection refused" errors. It also helpfully includes the underlying error when invalidating the tablet cache for a location. There's a relatively simple new unit test, and I also ran some before/after comparisons using the "loadgen" sample from the kudu-examples repo against a local cluster with one TS: Case 1: Master down when we start the client: ============================================================ Before the patch -------------- [New I/O worker #12] WARN org.apache.kudu.client.ConnectToCluster - Error receiving response from localhost:7051 org.apache.kudu.client.RecoverableException: connection closed at org.apache.kudu.client.Connection.channelClosed(Connection.java:254) at org.apache.kudu.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:88) at org.apache.kudu.client.Connection.handleUpstream(Connection.java:238) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791) at org.apache.kudu.shaded.org.jboss.netty.handler.timeout.ReadTimeoutHandler.channelClosed(ReadTimeoutHandler.java:176) at org.apache.kudu.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:88) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791) at org.apache.kudu.shaded.org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:60) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791) at org.apache.kudu.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.cleanup(FrameDecoder.java:493) at org.apache.kudu.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.channelClosed(FrameDecoder.java:371) at org.apache.kudu.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:88) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) at org.apache.kudu.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559) at org.apache.kudu.shaded.org.jboss.netty.channel.Channels.fireChannelClosed(Channels.java:468) at org.apache.kudu.shaded.org.jboss.netty.channel.Channels$6.run(Channels.java:457) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.ChannelRunnableWrapper.run(ChannelRunnableWrapper.java:40) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.processTaskQueue(AbstractNioSelector.java:391) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:315) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178) at org.apache.kudu.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) at org.apache.kudu.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) [New I/O boss #17] ERROR org.apache.kudu.client.Connection - [peer master-localhost:7051(localhost:7051)] unexpected exception from downstream on [id: 0xf9759051] java.net.ConnectException: Connection refused: localhost/127.0.0.1:7051 at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.connect(NioClientBoss.java:152) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.processSelectedKeys(NioClientBoss.java:105) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.process(NioClientBoss.java:79) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.run(NioClientBoss.java:42) at org.apache.kudu.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) at org.apache.kudu.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) [New I/O worker #12] WARN org.apache.kudu.client.ConnectToCluster - Unable to find the leader master localhost:7051; will retry ... repeats with exponential backoff After the patch -------------- [New I/O boss #17] INFO org.apache.kudu.client.Connection - Failed to connect to peer master-localhost:7051(localhost:7051): Connection refused: localhost/127.0.0.1:7051 [New I/O worker #1] INFO org.apache.kudu.client.ConnectToCluster - Unable to connect to master localhost:7051: Connection refused: localhost/127.0.0.1:7051 [New I/O worker #1] WARN org.apache.kudu.client.ConnectToCluster - Unable to find the leader master localhost:7051; will retry ... repeats with exponential backoff Case 2: Tserver is down ============================================================ Before the patch ---------------- [New I/O worker #11] INFO org.apache.kudu.client.AsyncKuduClient - Removing server fc9f6ab955cc47a8ab3653d0170305b0 from this tablet's cache 9da164fbcd93404ea8e38e9491eb3fa8 [New I/O boss #17] ERROR org.apache.kudu.client.Connection - [peer fc9f6ab955cc47a8ab3653d0170305b0(todd-laptop:7050)] unexpected exception from downstream on [id: 0x270e3a82] java.net.ConnectException: Connection refused: todd-laptop/127.0.1.1:7050 at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.connect(NioClientBoss.java:152) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.processSelectedKeys(NioClientBoss.java:105) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.process(NioClientBoss.java:79) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337) at org.apache.kudu.shaded.org.jboss.netty.channel.socket.nio.NioClientBoss.run(NioClientBoss.java:42) at org.apache.kudu.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) at org.apache.kudu.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) ... repeats with exponential backoff After the patch --------------- [New I/O boss #17] INFO org.apache.kudu.client.Connection - Failed to connect to peer fc9f6ab955cc47a8ab3653d0170305b0(todd-laptop:7050): Connection refused: todd-laptop/127.0.1.1:7050 [New I/O worker #15] INFO org.apache.kudu.client.AsyncKuduClient - Invalidating location fc9f6ab955cc47a8ab3653d0170305b0(todd-laptop:7050) for tablet 478ddbaafb5b494ea0de8e2d0ed45a00: Connection refused: todd-laptop/127.0.1.1:7050 ... repeats with exponential backoff Case 3: A tablet is bad ============================================================ In this case I manually delete a non-replicated tablet that is being written to using 'kudu remote_replica delete'. This is unrecoverable since there is no other replica. Before the patch ----------------- [New I/O worker #10] INFO org.apache.kudu.client.AsyncKuduClient - Removing server fc9f6ab955cc47a8ab3653d0170305b0 from this tablet's cache a633f4a0c14b4404a8ae1f825b378867 ... repeats with exponential backoff After the patch ---------------- [New I/O worker #9] INFO org.apache.kudu.client.AsyncKuduClient - Invalidating location 36622c46b25f4da9ada43f8591728053(todd-laptop:7050) for tablet c0b9488597e8447595ad0ee0a3378f95: Tablet not RUNNING: STOPPED ... repeats with exponential backoff Change-Id: I4b8be871693fecc1ee46a4e238dd2ed8f0730d4b Reviewed-on: http://gerrit.cloudera.org:8080/9644 Reviewed-by: Grant Henke Tested-by: Grant Henke Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/ead75684 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/ead75684 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/ead75684 Branch: refs/heads/master Commit: ead756844ce9ada904fcc3666df25692f63e76b8 Parents: 8204f43 Author: Todd Lipcon Authored: Wed Mar 14 15:33:27 2018 -0700 Committer: Grant Henke Committed: Thu Mar 15 16:14:23 2018 +0000 ---------------------------------------------------------------------- .../org/apache/kudu/client/AsyncKuduClient.java | 8 ++++-- .../apache/kudu/client/ConnectToCluster.java | 2 +- .../java/org/apache/kudu/client/Connection.java | 28 +++++++++++++++++-- .../kudu/client/TestClientFailoverSupport.java | 18 ++++++++++++ .../org/apache/kudu/client/TestKuduClient.java | 29 ++++++++++++++++++++ 5 files changed, 78 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/ead75684/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java index 4b93a34..9c52748 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java @@ -1643,7 +1643,7 @@ public class AsyncKuduClient implements AutoCloseable { * We're in the context of decode() meaning we need to either callback or retry later. */ void handleTabletNotFound(final KuduRpc rpc, KuduException ex, ServerInfo info) { - invalidateTabletCache(rpc.getTablet(), info); + invalidateTabletCache(rpc.getTablet(), info, ex.getMessage()); handleRetryableError(rpc, ex); } @@ -1732,9 +1732,11 @@ public class AsyncKuduClient implements AutoCloseable { * Remove the tablet server from the RemoteTablet's locations. Right now nothing is removing * the tablet itself from the caches. */ - private void invalidateTabletCache(RemoteTablet tablet, ServerInfo info) { + private void invalidateTabletCache(RemoteTablet tablet, ServerInfo info, + String errorMessage) { final String uuid = info.getUuid(); - LOG.info("Removing server {} from this tablet's cache {}", uuid, tablet.getTabletId()); + LOG.info("Invalidating location {} for tablet {}: {}", + info, tablet.getTabletId(), errorMessage); tablet.removeTabletClient(uuid); } http://git-wip-us.apache.org/repos/asf/kudu/blob/ead75684/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java b/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java index 3e925c5..911a83d 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java @@ -360,7 +360,7 @@ final class ConnectToCluster { @Override public Void call(Exception e) throws Exception { - LOG.warn("Error receiving response from {}", hostAndPort, e); + LOG.info("Unable to connect to master {}: {}", hostAndPort, e.getMessage()); exceptionsReceived.add(e); incrementCountAndCheckExhausted(); return null; http://git-wip-us.apache.org/repos/asf/kudu/blob/ead75684/java/kudu-client/src/main/java/org/apache/kudu/client/Connection.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/Connection.java b/java/kudu-client/src/main/java/org/apache/kudu/client/Connection.java index b7513e4..ff04ab0 100644 --- a/java/kudu-client/src/main/java/org/apache/kudu/client/Connection.java +++ b/java/kudu-client/src/main/java/org/apache/kudu/client/Connection.java @@ -17,6 +17,7 @@ package org.apache.kudu.client; +import java.net.ConnectException; import java.nio.channels.ClosedChannelException; import java.util.ArrayList; import java.util.HashMap; @@ -168,6 +169,11 @@ class Connection extends SimpleChannelUpstreamHandler { @GuardedBy("lock") private int nextCallId = 0; + @Nullable + @GuardedBy("lock") + /** The future for the connection attempt. Set only once connect() is called. */ + private ChannelFuture connectFuture; + /** * Create a new Connection object to the specified destination. * @@ -249,9 +255,21 @@ class Connection extends SimpleChannelUpstreamHandler { /** {@inheritDoc} */ @Override public void channelClosed(final ChannelHandlerContext ctx, final ChannelStateEvent e) { + String msg = "connection closed"; + // Connection failures are reported as channelClosed() before exceptionCaught() is called. + // We can detect this case by looking at whether connectFuture has been marked complete + // and grabbing the exception from there. + lock.lock(); + try { + if (connectFuture != null && connectFuture.getCause() != null) { + msg = connectFuture.getCause().getMessage(); + } + } finally { + lock.unlock(); + } // No need to call super.channelClosed(ctx, e) -- there should be nobody in the upstream // pipeline after Connection itself. So, just handle the close event ourselves. - cleanup(new RecoverableException(Status.NetworkError("connection closed"))); + cleanup(new RecoverableException(Status.NetworkError(msg))); } /** {@inheritDoc} */ @@ -415,7 +433,11 @@ class Connection extends SimpleChannelUpstreamHandler { explicitlyDisconnected ? "%s disconnected from peer" : "%s lost connection to peer", getLogPrefix()); error = new RecoverableException(Status.NetworkError(message), e); - LOG.info(message, e); + LOG.info(message); + } else if (e instanceof ConnectException) { + String message = "Failed to connect to peer " + serverInfo + ": " + e.getMessage(); + error = new RecoverableException(Status.NetworkError(message), e); + LOG.info(message); } else if (e instanceof SSLException && explicitlyDisconnected) { // There's a race in Netty where, when we call Channel.close(), it tries // to send a TLS 'shutdown' message and enters a shutdown state. If another @@ -703,7 +725,7 @@ class Connection extends SimpleChannelUpstreamHandler { Preconditions.checkState(lock.isHeldByCurrentThread()); Preconditions.checkState(state == State.NEW); state = State.CONNECTING; - channel.connect(serverInfo.getResolvedAddress()); + connectFuture = channel.connect(serverInfo.getResolvedAddress()); } /** Enumeration to represent the internal state of the Connection object. */ http://git-wip-us.apache.org/repos/asf/kudu/blob/ead75684/java/kudu-client/src/test/java/org/apache/kudu/client/TestClientFailoverSupport.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestClientFailoverSupport.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestClientFailoverSupport.java index 6f8f51f..31e8b77 100644 --- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestClientFailoverSupport.java +++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestClientFailoverSupport.java @@ -20,16 +20,21 @@ import static org.apache.kudu.util.AssertHelpers.assertEventuallyTrue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import java.io.Closeable; import java.io.IOException; import java.util.List; import org.apache.kudu.util.AssertHelpers.BooleanExpression; +import org.apache.kudu.util.CapturingLogAppender; import org.junit.After; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import com.google.common.net.HostAndPort; public class TestClientFailoverSupport extends BaseKuduTest { + private CapturingLogAppender cla = new CapturingLogAppender(); + private Closeable claAttach; enum MasterFailureType { RESTART, @@ -42,12 +47,25 @@ public class TestClientFailoverSupport extends BaseKuduTest { BaseKuduTest.doSetup(3, NUM_TABLET_SERVERS); } + @Before + public void attachToLog() { + claAttach = cla.attach(); + } + @After public void restartKilledMaster() throws IOException { miniCluster.restartDeadMasters(); miniCluster.restartDeadTservers(); } + @After + public void checkLogs() throws IOException { + claAttach.close(); + String log = cla.getAppendedText(); + assertFalse("Log should not contain Netty internals", + log.contains("socket.nio.AbstractNioSelector")); + } + private void waitUntilRowCount(final KuduTable table, final int rowCount, long timeoutMs) throws Exception { assertEventuallyTrue(String.format("Read count should be %s", rowCount), http://git-wip-us.apache.org/repos/asf/kudu/blob/ead75684/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduClient.java ---------------------------------------------------------------------- diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduClient.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduClient.java index 52e8fdb..d527bd0 100644 --- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduClient.java +++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduClient.java @@ -906,6 +906,35 @@ public class TestKuduClient extends BaseKuduTest { exception_text.contains("lost connection to peer")); } + /** + * Test that, if the masters are down when we attempt to connect, we don't end up + * logging any nonsensical stack traces including Netty internals. + */ + @Test(timeout = 100000) + public void testNoLogSpewOnConnectionRefused() throws Exception { + CapturingLogAppender cla = new CapturingLogAppender(); + try (Closeable c = cla.attach()) { + miniCluster.killMasters(); + try (KuduClient localClient = new KuduClient.KuduClientBuilder(masterAddresses).build()) { + // Force the client to connect to the masters. + localClient.exportAuthenticationCredentials(); + } catch (NoLeaderFoundException e) { + assertTrue("Bad exception string: " + e.getMessage(), + e.getMessage().matches(".*Master config .+ has no leader. " + + "Exceptions received:.*Connection refused.*Connection refused" + + ".*Connection refused.*")); + } + } finally { + miniCluster.restartDeadMasters(); + } + // Ensure there is no log spew due to an unexpected lost connection. + String logText = cla.getAppendedText(); + assertFalse("Should not claim to have lost a connection in the log", + logText.contains("lost connection to peer")); + assertFalse("Should not have netty spew in log", + logText.contains("socket.nio.AbstractNioSelector")); + } + @Test(timeout = 100000) public void testCustomNioExecutor() throws Exception { long startTime = System.nanoTime();