From commits-return-7887-archive-asf-public=cust-asf.ponee.io@zookeeper.apache.org Mon Jul 29 09:49:41 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id 353A118063F for ; Mon, 29 Jul 2019 11:49:39 +0200 (CEST) Received: (qmail 29514 invoked by uid 500); 29 Jul 2019 09:49:35 -0000 Mailing-List: contact commits-help@zookeeper.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@zookeeper.apache.org Delivered-To: mailing list commits@zookeeper.apache.org Received: (qmail 29498 invoked by uid 99); 29 Jul 2019 09:49:35 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 29 Jul 2019 09:49:35 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id B168385E9D; Mon, 29 Jul 2019 09:49:35 +0000 (UTC) Date: Mon, 29 Jul 2019 09:49:35 +0000 To: "commits@zookeeper.apache.org" Subject: [zookeeper] branch master updated: ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <156439377558.25990.3852191421109733768@gitbox.apache.org> From: nkalmar@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: zookeeper X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 83940b18ce70016de7400f6159ce2a2db5a0bbec X-Git-Newrev: 05ee9413e7a31703395b81fb8d72baf1cb09a46d X-Git-Rev: 05ee9413e7a31703395b81fb8d72baf1cb09a46d X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. 
nkalmar pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/zookeeper.git The following commit(s) were added to refs/heads/master by this push: new 05ee941 ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time 05ee941 is described below commit 05ee9413e7a31703395b81fb8d72baf1cb09a46d Author: Igor Skokov AuthorDate: Mon Jul 29 11:49:27 2019 +0200 ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time Author: Igor Skokov Author: Igor Skokov Reviewers: Enrico Olivelli , Norbert Kalmar Closes #863 from Lagrang/ZOOKEEPER-3320 --- .../src/main/resources/markdown/zookeeperAdmin.md | 12 ++++++ .../zookeeper/server/quorum/QuorumCnxManager.java | 46 ++++++++++++++++------ .../zookeeper/server/quorum/CnxManagerTest.java | 29 ++++++++++++++ 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md index 1690ce6..d38afd2 100644 --- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md +++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md @@ -1076,6 +1076,18 @@ As an example, this will enable all four letter word commands: properly, check your operating system's options regarding TCP keepalive for more information. Defaults to **false**. + +* *zookeeper.electionPortBindRetry* : + (Java system property only: **zookeeper.electionPortBindRetry**) + Sets the maximum number of retries when the ZooKeeper server fails to bind + the leader election port. Such errors can be temporary and recoverable, + such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320), + or non-retryable, such as the port already being in use. + In case of transient errors, this property can improve the availability + of the ZooKeeper server and help it recover on its own. + The default value is 3. 
In container environments, especially in Kubernetes, + this value should be increased to overcome issues related to DNS name resolution. + * *observer.reconnectDelayMs* : (Java system property: **zookeeper.observer.reconnectDelayMs**) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java index d97da2a..4be8fa6 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java @@ -18,6 +18,8 @@ package org.apache.zookeeper.server.quorum; +import static org.apache.zookeeper.common.NetUtils.formatInetAddr; + import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; @@ -36,6 +38,7 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashSet; import java.util.Map; +import java.util.NoSuchElementException; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ConcurrentHashMap; @@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; - +import javax.net.ssl.SSLSocket; import org.apache.zookeeper.common.X509Exception; import org.apache.zookeeper.server.ExitCode; -import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException; -import org.apache.zookeeper.server.util.ConfigUtils; import org.apache.zookeeper.server.ZooKeeperThread; +import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException; import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner; import 
org.apache.zookeeper.server.quorum.auth.QuorumAuthServer; import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier; +import org.apache.zookeeper.server.util.ConfigUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.net.ssl.SSLSocket; -import static org.apache.zookeeper.common.NetUtils.formatInetAddr; - /** * This class implements a connection manager for leader election using TCP. It * maintains one connection for every pair of servers. The tricky part is to @@ -848,12 +847,30 @@ public class QuorumCnxManager { */ public class Listener extends ZooKeeperThread { + private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry"; + private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3; + + private final int portBindMaxRetry; volatile ServerSocket ss = null; public Listener() { // During startup of thread, thread name will be overridden to // specific election address super("ListenerThread"); + + // maximum retry count while trying to bind to election port + // see ZOOKEEPER-3320 for more details + final Integer maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY, + DEFAULT_PORT_BIND_MAX_RETRY); + if (maxRetry >= 0) { + LOG.info("Election port bind maximum retries is {}", maxRetry); + portBindMaxRetry = maxRetry; + } else { + LOG.info("'{}' contains invalid value: {}(must be >= 0). 
" + + "Use default value of {} instead.", + ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY); + portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY; + } } /** @@ -865,7 +882,7 @@ public class QuorumCnxManager { InetSocketAddress addr; Socket client = null; Exception exitException = null; - while((!shutdown) && (numRetries < 3)){ + while((!shutdown) && (numRetries < portBindMaxRetry)){ try { if (self.shouldUsePortUnification()) { LOG.info("Creating TLS-enabled quorum server socket"); @@ -935,11 +952,14 @@ public class QuorumCnxManager { } LOG.info("Leaving listener"); if (!shutdown) { - LOG.error("As I'm leaving the listener thread, " - + "I won't be able to participate in leader " - + "election any longer: " - + formatInetAddr(self.getElectionAddress())); - if (exitException instanceof BindException) { + LOG.error("As I'm leaving the listener thread after " + + numRetries + " errors. " + + "I won't be able to participate in leader " + + "election any longer: " + + formatInetAddr(self.getElectionAddress()) + + ". Use " + ELECTION_PORT_BIND_RETRY + " property to " + + "increase retry count."); + if (exitException instanceof SocketException) { // After leaving listener thread, the host cannot join the // quorum anymore, this is a severe error that we cannot // recover from, so we need to exit diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java index 878e41b..200ed99 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java @@ -291,6 +291,35 @@ public class CnxManagerTest extends ZKTestCase { } /** + * Test for bug described in {@link https://issues.apache.org/jira/browse/ZOOKEEPER-3320}. 
+ * Test create peer with address which contains unresolvable DNS name, + * leader election listener thread should stop after N errors. + * + * @throws Exception + */ + @Test + public void testCnxManagerListenerThreadConfigurableRetry() throws Exception { + final Map unresolvablePeers = new HashMap<>(); + final long myid = 1L; + unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181")); + final QuorumPeer peer = new QuorumPeer(unresolvablePeers, + ClientBase.createTmpDir(), + ClientBase.createTmpDir(), + 2181, 3, myid, 1000, 2, 2, 2); + final QuorumCnxManager cnxManager = peer.createCnxnManager(); + QuorumCnxManager.Listener listener = cnxManager.listener; + listener.start(); + // listener thread should stop and throws error which notify QuorumPeer about error. + // QuorumPeer should start shutdown process + listener.join(15000); // set wait time, if listener contains bug and thread not stops. + Assert.assertFalse(listener.isAlive()); + Assert.assertFalse(peer.isRunning()); + peer.join(15000); + Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped after " + + "listener thread death", listener.isAlive()); + } + + /** * Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6 * observer connects to a 3.5.0 server. * see https://issues.apache.org/jira/browse/ZOOKEEPER-1789