hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From st...@apache.org
Subject hbase git commit: HBASE-14498 Master stuck in infinite loop when all Zookeeper servers are unreachable
Date Fri, 18 Aug 2017 21:25:10 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-1.3 bc925b4e8 -> 8b977d778


HBASE-14498 Master stuck in infinite loop when all Zookeeper servers are unreachable

Signed-off-by: Michael Stack <stack@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/8b977d77
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/8b977d77
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/8b977d77

Branch: refs/heads/branch-1.3
Commit: 8b977d77851cb3b63b6eff32277bc182128b8201
Parents: bc925b4
Author: Pankaj Kumar <pankaj.kr@huawei.com>
Authored: Fri Aug 18 23:44:52 2017 +0800
Committer: Michael Stack <stack@apache.org>
Committed: Fri Aug 18 14:25:03 2017 -0700

----------------------------------------------------------------------
 .../hbase/zookeeper/ZooKeeperWatcher.java       | 91 ++++++++++++++++++--
 .../hbase/zookeeper/TestZooKeeperWatcher.java   | 48 +++++++++++
 .../hbase/regionserver/HRegionServer.java       |  4 +-
 3 files changed, 135 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/8b977d77/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
----------------------------------------------------------------------
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
index f5fa0b7..36aee61 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
@@ -26,6 +26,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -38,8 +41,8 @@ import org.apache.hadoop.hbase.AuthUtil;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
-import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.security.Superusers;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
@@ -78,7 +81,7 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
   private final RecoverableZooKeeper recoverableZooKeeper;
 
   // abortable in case of zk failure
-  protected Abortable abortable;
+  protected final Abortable abortable;
   // Used if abortable is null
   private boolean aborted = false;
 
@@ -90,6 +93,14 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
   // negotiation to complete
   public CountDownLatch saslLatch = new CountDownLatch(1);
 
+  // Connection timeout on disconnect event
+  private long connWaitTimeOut;
+  private AtomicBoolean connected = new AtomicBoolean(false);
+  private boolean forceAbortOnZKDisconnect;
+
+  // Execute service for zookeeper disconnect event watcher
+  private ExecutorService zkEventWatcherExecService = null;
+
   // node names
 
   // base znode for this cluster
@@ -164,9 +175,26 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
    * @throws IOException
    * @throws ZooKeeperConnectionException
    */
-  public ZooKeeperWatcher(Configuration conf, String identifier,
-      Abortable abortable, boolean canCreateBaseZNode)
-  throws IOException, ZooKeeperConnectionException {
+  public ZooKeeperWatcher(Configuration conf, String identifier, Abortable abortable,
+      boolean canCreateBaseZNode) throws IOException, ZooKeeperConnectionException {
+    this(conf, identifier, abortable, canCreateBaseZNode, false);
+  }
+
+  /**
+   * Instantiate a ZooKeeper connection and watcher.
+   * @param conf Configuration
+   * @param identifier string that is passed to RecoverableZookeeper to be used as identifier for
+   *          this instance. Use null for default.
+   * @param abortable Can be null if there is on error there is no host to abort: e.g. client
+   *          context.
+   * @param canCreateBaseZNode whether create base node.
+   * @param forceAbortOnZKDisconnect abort the watcher if true.
+   * @throws IOException when any IO exception
+   * @throws ZooKeeperConnectionException when any zookeeper connection exception
+   */
+  public ZooKeeperWatcher(Configuration conf, String identifier, Abortable abortable,
+      boolean canCreateBaseZNode, boolean forceAbortOnZKDisconnect)
+          throws IOException, ZooKeeperConnectionException {
     this.conf = conf;
     this.quorum = ZKConfig.getZKQuorumServersString(conf);
     this.prefix = identifier;
@@ -175,6 +203,9 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
     this.identifier = identifier + "0x0";
     this.abortable = abortable;
     setNodeNames(conf);
+    // On Disconnected event a thread will wait for sometime (2/3 of zookeeper.session.timeout),
+    // it will abort the process if no SyncConnected event reported by the time.
+    connWaitTimeOut = this.conf.getLong("zookeeper.session.timeout", 90000) * 2 / 3;
     PendingWatcher pendingWatcher = new PendingWatcher();
     this.recoverableZooKeeper = ZKUtil.connect(conf, quorum, pendingWatcher, identifier);
     pendingWatcher.prepare(this);
@@ -640,6 +671,10 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
         break;
       }
     }
+    this.forceAbortOnZKDisconnect = forceAbortOnZKDisconnect;
+    if (this.forceAbortOnZKDisconnect) {
+      this.zkEventWatcherExecService = Executors.newSingleThreadExecutor();
+    }
   }
 
   // Connection management
@@ -661,11 +696,19 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
           Long.toHexString(this.recoverableZooKeeper.getSessionId());
         // Update our identifier.  Otherwise ignore.
         LOG.debug(this.identifier + " connected");
+        connected.set(true);
         break;
 
       // Abort the server if Disconnected or Expired
       case Disconnected:
-        LOG.debug(prefix("Received Disconnected from ZooKeeper, ignoring"));
+        LOG.debug(prefix("Received Disconnected from ZooKeeper."));
+        if (forceAbortOnZKDisconnect) {
+          connected.set(false);
+          ZKDisconnectEventWatcher task = new ZKDisconnectEventWatcher();
+          zkEventWatcherExecService.execute(task);
+        } else {
+          LOG.debug(prefix("Received Disconnected from ZooKeeper, ignoring."));
+        }
         break;
 
       case Expired:
@@ -688,6 +731,38 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
     }
   }
 
+  /*
+   * Task to watch zookeper disconnect event.
+   */
+  class ZKDisconnectEventWatcher implements Runnable {
+    @Override
+    public void run() {
+      if (connected.get()) {
+        return;
+      }
+
+      long startTime = EnvironmentEdgeManager.currentTime();
+      while (EnvironmentEdgeManager.currentTime() - startTime < connWaitTimeOut) {
+        if (connected.get()) {
+          LOG.debug(prefix("Client got reconnected to zookeeper."));
+          return;
+        }
+        try {
+          Thread.sleep(100);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          break;
+        }
+      }
+
+      if (!connected.get() && abortable != null) {
+        String msg = prefix(
+          "Couldn't connect to ZooKeeper after waiting " + connWaitTimeOut + " ms, aborting");
+        abortable.abort(msg, new KeeperException.ConnectionLossException());
+      }
+    }
+  }
+
   /**
    * Forces a synchronization of this ZooKeeper client connection.
    * <p>
@@ -746,6 +821,10 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
   public void close() {
     try {
       recoverableZooKeeper.close();
+      if (zkEventWatcherExecService != null) {
+        zkEventWatcherExecService.shutdown();
+        zkEventWatcherExecService = null;
+      }
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
     }

http://git-wip-us.apache.org/repos/asf/hbase/blob/8b977d77/hbase-client/src/test/java/org/apache/hadoop/hbase/zookeeper/TestZooKeeperWatcher.java
----------------------------------------------------------------------
diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/zookeeper/TestZooKeeperWatcher.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/zookeeper/TestZooKeeperWatcher.java
index 10a3816..d4d132b 100644
--- a/hbase-client/src/test/java/org/apache/hadoop/hbase/zookeeper/TestZooKeeperWatcher.java
+++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/zookeeper/TestZooKeeperWatcher.java
@@ -22,14 +22,22 @@ import static org.junit.Assert.*;
 
 import java.io.IOException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
 
 @Category({SmallTests.class})
 public class TestZooKeeperWatcher {
+  private final static Log LOG = LogFactory.getLog(TestZooKeeperWatcher.class);
 
   @Test
   public void testIsClientReadable() throws ZooKeeperConnectionException, IOException {
@@ -57,4 +65,44 @@ public class TestZooKeeperWatcher {
     watcher.close();
   }
 
+  @Test
+  public void testConnectionEvent() throws ZooKeeperConnectionException, IOException {
+    long zkSessionTimeout = 15000l;
+    Configuration conf = HBaseConfiguration.create();
+    conf.set("zookeeper.session.timeout", "15000");
+
+    Abortable abortable = new Abortable() {
+      boolean aborted = false;
+
+      @Override
+      public void abort(String why, Throwable e) {
+        aborted = true;
+        LOG.error(why, e);
+      }
+
+      @Override
+      public boolean isAborted() {
+        return aborted;
+      }
+    };
+    ZooKeeperWatcher watcher =
+        new ZooKeeperWatcher(conf, "testConnectionEvent", abortable, false, true);
+
+    WatchedEvent event = new WatchedEvent(Watcher.Event.EventType.None,
+        Watcher.Event.KeeperState.Disconnected, null);
+
+    long startTime = EnvironmentEdgeManager.currentTime();
+    while (!abortable.isAborted()
+        && (EnvironmentEdgeManager.currentTime() - startTime < zkSessionTimeout)) {
+      watcher.process(event);
+      try {
+        Thread.sleep(1000);
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+      }
+    }
+
+    assertTrue(abortable.isAborted());
+    watcher.close();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/8b977d77/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 54934f3..d997ea3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -596,8 +596,8 @@ public class HRegionServer extends HasThread implements
     // Some unit tests don't need a cluster, so no zookeeper at all
     if (!conf.getBoolean("hbase.testing.nocluster", false)) {
       // Open connection to zookeeper and set primary watcher
-      zooKeeper = new ZooKeeperWatcher(conf, getProcessName() + ":" +
-        rpcServices.isa.getPort(), this, canCreateBaseZNode());
+      zooKeeper = new ZooKeeperWatcher(conf, getProcessName() + ":" + rpcServices.isa.getPort(),
+          this, canCreateBaseZNode(), true);
 
       this.csm = (BaseCoordinatedStateManager) csm;
       this.csm.initialize(this);


Mime
View raw message