zookeeper-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nkal...@apache.org
Subject [zookeeper] branch master updated: ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time
Date Mon, 29 Jul 2019 09:49:35 GMT
This is an automated email from the ASF dual-hosted git repository.

nkalmar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 05ee941  ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time
05ee941 is described below

commit 05ee9413e7a31703395b81fb8d72baf1cb09a46d
Author: Igor Skokov <igor.skokov@cinarra.com>
AuthorDate: Mon Jul 29 11:49:27 2019 +0200

    ZOOKEEPER-3320: Leader election port stop listen when hostname unresolvable for some time
    
    Author: Igor Skokov <igor.skokov@cinarra.com>
    Author: Igor Skokov <lagrang09@gmail.com>
    
    Reviewers: Enrico Olivelli <eolivelli@apache.org>, Norbert Kalmar <nkalmar@apache.org>
    
    Closes #863 from Lagrang/ZOOKEEPER-3320
---
 .../src/main/resources/markdown/zookeeperAdmin.md  | 12 ++++++
 .../zookeeper/server/quorum/QuorumCnxManager.java  | 46 ++++++++++++++++------
 .../zookeeper/server/quorum/CnxManagerTest.java    | 29 ++++++++++++++
 3 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index 1690ce6..d38afd2 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -1076,6 +1076,18 @@ As an example, this will enable all four letter word commands:
     properly, check your operating system's options regarding TCP
     keepalive for more information.  Defaults to
     **false**.
+    
+* *zookeeper.electionPortBindRetry* :
+    (Java system property only: **zookeeper.electionPortBindRetry**)
+    Sets the maximum number of retries when the ZooKeeper server fails to bind
+    the leader election port. Such errors can be temporary and recoverable,
+    such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320),
+    or non-retryable, such as the port already being in use.
+    In case of transient errors, this property can improve the availability
+    of the ZooKeeper server and help it recover by itself.
+    The default value is 3. In container environments, especially Kubernetes,
+    this value should be increased to overcome issues related to DNS name resolution.
+    
 
 * *observer.reconnectDelayMs* :
     (Java system property: **zookeeper.observer.reconnectDelayMs**)
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
index d97da2a..4be8fa6 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumCnxManager.java
@@ -18,6 +18,8 @@
 
 package org.apache.zookeeper.server.quorum;
 
+import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
+
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.DataInputStream;
@@ -36,6 +38,7 @@ import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.Set;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
@@ -43,24 +46,20 @@ import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
-import java.util.NoSuchElementException;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
-
+import javax.net.ssl.SSLSocket;
 import org.apache.zookeeper.common.X509Exception;
 import org.apache.zookeeper.server.ExitCode;
-import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
-import org.apache.zookeeper.server.util.ConfigUtils;
 import org.apache.zookeeper.server.ZooKeeperThread;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
 import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner;
 import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer;
 import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
+import org.apache.zookeeper.server.util.ConfigUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.net.ssl.SSLSocket;
-import static org.apache.zookeeper.common.NetUtils.formatInetAddr;
-
 /**
  * This class implements a connection manager for leader election using TCP. It
  * maintains one connection for every pair of servers. The tricky part is to
@@ -848,12 +847,30 @@ public class QuorumCnxManager {
      */
     public class Listener extends ZooKeeperThread {
 
+        private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry";
+        private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3;
+
+        private final int portBindMaxRetry;
         volatile ServerSocket ss = null;
 
         public Listener() {
             // During startup of thread, thread name will be overridden to
             // specific election address
             super("ListenerThread");
+
+            // maximum retry count while trying to bind to election port
+            // see ZOOKEEPER-3320 for more details
+            final Integer maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY,
+                                                        DEFAULT_PORT_BIND_MAX_RETRY);
+            if (maxRetry >= 0) {
+                LOG.info("Election port bind maximum retries is {}", maxRetry);
+                portBindMaxRetry = maxRetry;
+            } else {
+                LOG.info("'{}' contains invalid value: {}(must be >= 0). "
+                         + "Use default value of {} instead.",
+                         ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY);
+                portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY;
+            }
         }
 
         /**
@@ -865,7 +882,7 @@ public class QuorumCnxManager {
             InetSocketAddress addr;
             Socket client = null;
             Exception exitException = null;
-            while((!shutdown) && (numRetries < 3)){
+            while((!shutdown) && (numRetries < portBindMaxRetry)){
                 try {
                     if (self.shouldUsePortUnification()) {
                         LOG.info("Creating TLS-enabled quorum server socket");
@@ -935,11 +952,14 @@ public class QuorumCnxManager {
             }
             LOG.info("Leaving listener");
             if (!shutdown) {
-                LOG.error("As I'm leaving the listener thread, "
-                        + "I won't be able to participate in leader "
-                        + "election any longer: "
-                        + formatInetAddr(self.getElectionAddress()));
-                if (exitException instanceof BindException) {
+                LOG.error("As I'm leaving the listener thread after "
+                          + numRetries + " errors. "
+                          + "I won't be able to participate in leader "
+                          + "election any longer: "
+                          + formatInetAddr(self.getElectionAddress())
+                          + ". Use " + ELECTION_PORT_BIND_RETRY + " property to "
+                          + "increase retry count.");
+                if (exitException instanceof SocketException) {
                     // After leaving listener thread, the host cannot join the
                     // quorum anymore, this is a severe error that we cannot
                     // recover from, so we need to exit
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
index 878e41b..200ed99 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CnxManagerTest.java
@@ -291,6 +291,35 @@ public class CnxManagerTest extends ZKTestCase {
     }
 
     /**
+     * Test for bug described in {@link https://issues.apache.org/jira/browse/ZOOKEEPER-3320}.
+     * Test create peer with address which contains unresolvable DNS name,
+     * leader election listener thread should stop after N errors.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testCnxManagerListenerThreadConfigurableRetry() throws Exception {
+        final Map<Long,QuorumServer> unresolvablePeers = new HashMap<>();
+        final long myid = 1L;
+        unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181"));
+        final QuorumPeer peer = new QuorumPeer(unresolvablePeers,
+                                               ClientBase.createTmpDir(),
+                                               ClientBase.createTmpDir(),
+                                               2181, 3, myid, 1000, 2, 2, 2);
+        final QuorumCnxManager cnxManager = peer.createCnxnManager();
+        QuorumCnxManager.Listener listener = cnxManager.listener;
+        listener.start();
+        // listener thread should stop and throws error which notify QuorumPeer about error.
+        // QuorumPeer should start shutdown process
+        listener.join(15000); // set wait time, if listener contains bug and thread not stops.
+        Assert.assertFalse(listener.isAlive());
+        Assert.assertFalse(peer.isRunning());
+        peer.join(15000);
+        Assert.assertFalse(QuorumPeer.class.getSimpleName() + " not stopped after "
+                           + "listener thread death", listener.isAlive());
+    }
+
+    /**
      * Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6
      * observer connects to a 3.5.0 server. 
      * see https://issues.apache.org/jira/browse/ZOOKEEPER-1789


Mime
View raw message