hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From st...@apache.org
Subject hbase git commit: HBASE-17718 Difference between RS's servername and its ephemeral node cause SSH stop working
Date Wed, 08 Mar 2017 18:47:57 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-1 ba292ccbf -> ca5b8a44a


HBASE-17718 Difference between RS's servername and its ephemeral node cause SSH stop working

This patch reverts HBASE-9593 -- i.e. registering in zk before we
register with master putting it back to how it was where we register
in zk AFTER we report for duty with the master (because then we'll
register in zk with the name the master gave us). It then fixes the
problem reported in HBASE-9593 in an alternate fashion by checking
for a RS znode if we failed a connect on assign; if none found, we
remove a server from online servers list.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
 Make move method available to tests.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
 Correct method name changing moveFromOnelineToDeadServers to
 moveFromOnlineToDeadServers

 Add actual fix which is call to checkForRSznode if exception trying to
 open a region; if none found, call expire on the server so it gets
 removed from the list of online servers.

 This patch exposes sloppyness in the waitForRegionServers around our
 current case where Master is hosting regions but ONLY hbase:meta;
 in this case we need to wait on at least another server to report
 in beyond Master (we weren't but stuff was 'working' because of the
 early registration of RS nodes in zk).

M hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
 Make 'killed' available to tests.

 Put registry of ephemeral node back to where it was originally,
 so it is AFTER we get response from Master on registering for duty
 so we can put our znode up in zk with the name the Master gave us
 rather than local name (which could be unknown to the Master).
   private boolean stopping = false;

M hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
 Cleanup and test of new cleanup.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/ca5b8a44
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/ca5b8a44
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/ca5b8a44

Branch: refs/heads/branch-1
Commit: ca5b8a44a490143536625502d5d88918f338f562
Parents: ba292cc
Author: Michael Stack <stack@apache.org>
Authored: Sat Mar 4 00:21:19 2017 -0800
Committer: Michael Stack <stack@apache.org>
Committed: Wed Mar 8 10:47:53 2017 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/master/HMaster.java |  11 +-
 .../hadoop/hbase/master/ServerListener.java     |   9 +-
 .../hadoop/hbase/master/ServerManager.java      | 160 ++++++++++++-----
 .../hbase/regionserver/HRegionServer.java       |   8 +-
 .../hbase/zookeeper/DrainingServerTracker.java  |  30 ++--
 .../hbase/master/TestAssignmentListener.java    |   5 +
 .../procedure/TestMasterProcedureEvents.java    |   2 +-
 .../procedure/TestServerCrashProcedure.java     |   2 +-
 .../TestRSKilledWhenInitializing.java           | 176 ++++++++++++-------
 9 files changed, 278 insertions(+), 125 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index febbf63..25c6504 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -299,7 +299,7 @@ public class HMaster extends HRegionServer implements MasterServices,
Server {
   MemoryBoundedLogMessageBuffer rsFatals;
 
   // flag set after we become the active master (used for testing)
-  private volatile boolean isActiveMaster = false;
+  private volatile boolean activeMaster = false;
 
   // flag set after we complete initialization once active,
   // it is not private since it's used in unit tests
@@ -574,7 +574,7 @@ public class HMaster extends HRegionServer implements MasterServices,
Server {
   @Override
   protected void waitForMasterActive(){
     boolean tablesOnMaster = BaseLoadBalancer.tablesOnMaster(conf);
-    while (!(tablesOnMaster && isActiveMaster)
+    while (!(tablesOnMaster && activeMaster)
         && !isStopped() && !isAborted()) {
       sleeper.sleep();
     }
@@ -721,7 +721,7 @@ public class HMaster extends HRegionServer implements MasterServices,
Server {
   private void finishActiveMasterInitialization(MonitoredTask status)
       throws IOException, InterruptedException, KeeperException, CoordinatedStateException
{
 
-    isActiveMaster = true;
+    activeMaster = true;
     Thread zombieDetector = new Thread(new InitializationMonitor(this),
         "ActiveMasterInitializationMonitor-" + System.currentTimeMillis());
     zombieDetector.start();
@@ -1623,7 +1623,8 @@ public class HMaster extends HRegionServer implements MasterServices,
Server {
       this.catalogJanitorChore, region_a, region_b, forcible, user));
   }
 
-  void move(final byte[] encodedRegionName,
+  @VisibleForTesting // Public so can be accessed by tests.
+  public void move(final byte[] encodedRegionName,
       final byte[] destServerName) throws HBaseIOException {
     RegionState regionState = assignmentManager.getRegionStates().
       getRegionState(Bytes.toString(encodedRegionName));
@@ -2612,7 +2613,7 @@ public class HMaster extends HRegionServer implements MasterServices,
Server {
    * @return true if active master, false if not.
    */
   public boolean isActiveMaster() {
-    return isActiveMaster;
+    return activeMaster;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
index f168686..731eb8b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
@@ -22,12 +22,17 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.ServerName;
 
 /**
- * Get notification of server events. The invocations are inline
- * so make sure your implementation is fast else you'll slow hbase.
+ * Get notification of server registration events. The invocations are inline
+ * so make sure your implementation is fast or else you'll slow hbase.
  */
 @InterfaceAudience.Private
 public interface ServerListener {
   /**
+   * Started waiting on RegionServers to check-in.
+   */
+  void waiting();
+
+  /**
    * The server has joined the cluster.
    * @param serverName The remote servers name.
    */

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 1817d6e..61790d0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hbase.master;
 
 import java.io.IOException;
+import java.net.ConnectException;
 import java.net.InetAddress;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -33,7 +34,6 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.CopyOnWriteArrayList;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.client.ClusterConnection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
+import org.apache.hadoop.hbase.ipc.FailedServerException;
 import org.apache.hadoop.hbase.ipc.PayloadCarryingRpcController;
 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
@@ -595,6 +596,11 @@ public class ServerManager {
     }
   }
 
+  private List<String> getRegionServersInZK(final ZooKeeperWatcher zkw)
+  throws KeeperException {
+    return ZKUtil.listChildrenNoWatch(zkw, zkw.rsZNode);
+  }
+
   /*
    * Expire the passed server.  Add it to list of dead servers and queue a
    * shutdown processing.
@@ -618,7 +624,7 @@ public class ServerManager {
           " but server shutdown already in progress");
       return;
     }
-    moveFromOnelineToDeadServers(serverName);
+    moveFromOnlineToDeadServers(serverName);
 
     // If cluster is going down, yes, servers are going to be expiring; don't
     // process as a dead server
@@ -648,7 +654,7 @@ public class ServerManager {
   }
 
   @VisibleForTesting
-  public void moveFromOnelineToDeadServers(final ServerName sn) {
+  public void moveFromOnlineToDeadServers(final ServerName sn) {
     synchronized (onlineServers) {
       if (!this.onlineServers.containsKey(sn)) {
         LOG.warn("Expiration of " + sn + " but server not online");
@@ -776,11 +782,66 @@ public class ServerManager {
       OpenRegionResponse response = admin.openRegion(null, request);
       return ResponseConverter.getRegionOpeningState(response);
     } catch (ServiceException se) {
+      checkForRSznode(server, se);
       throw ProtobufUtil.getRemoteException(se);
     }
   }
 
   /**
+   * Check for an odd state, where we think an RS is up but it is not. Do it on OPEN.
+   * This is only case where the check makes sense.
+   *
+   * <p>We are checking for instance of HBASE-9593 where a RS registered but died before
it put
+   * up its znode in zk. In this case, the RS made it into the list of online servers but
it
+   * is not actually UP. We do the check here where there is an evident problem rather
+   * than do some crazy footwork where we'd have master check zk after a RS had reported
+   * for duty with provisional state followed by a confirmed state; that'd be a mess.
+   * Real fix is HBASE-17733.
+   */
+  private void checkForRSznode(final ServerName serverName, final ServiceException se) {
+    if (se.getCause() == null) return;
+    Throwable t = se.getCause();
+    if (t instanceof ConnectException) {
+      // If this, proceed to do cleanup.
+    } else {
+      // Look for FailedServerException
+      if (!(t instanceof IOException)) return;
+      if (t.getCause() == null) return;
+      if (!(t.getCause() instanceof FailedServerException)) return;
+      // Ok, found FailedServerException -- continue.
+    }
+    if (!isServerOnline(serverName)) return;
+    // We think this server is online. Check it has a znode up. Currently, a RS
+    // registers an ephereral znode in zk. If not present, something is up. Maybe
+    // HBASE-9593 where RS crashed AFTER reportForDuty but BEFORE it put up an ephemeral
+    // znode.
+    List<String> servers = null;
+    try {
+      servers = getRegionServersInZK(this.master.getZooKeeper());
+    } catch (KeeperException ke) {
+      LOG.warn("Failed to list regionservers", ke);
+      // ZK is malfunctioning, don't hang here
+    }
+    boolean found = false;
+    if (servers != null) {
+      for (String serverNameAsStr: servers) {
+        ServerName sn = ServerName.valueOf(serverNameAsStr);
+        if (sn.equals(serverName)) {
+          // Found a server up in zk.
+          found = true;
+          break;
+        }
+      }
+    }
+    if (!found) {
+      LOG.warn("Online server " + serverName.toString() + " has no corresponding " +
+        "ephemeral znode (Did it die before registering in zk?); " +
+          "calling expire to clean it up!");
+      expireServer(serverName);
+    }
+  }
+
+  /**
    * Sends an OPEN RPC to the specified server to open the specified region.
    * <p>
    * Open should not fail but can if server just crashed.
@@ -805,6 +866,7 @@ public class ServerManager {
       OpenRegionResponse response = admin.openRegion(null, request);
       return ResponseConverter.getRegionOpeningStateList(response);
     } catch (ServiceException se) {
+      checkForRSznode(server, se);
       throw ProtobufUtil.getRemoteException(se);
     }
   }
@@ -992,6 +1054,27 @@ public class ServerManager {
   }
 
   /**
+   * Calculate min necessary to start. This is not an absolute. It is just
+   * a friction that will cause us hang around a bit longer waiting on
+   * RegionServers to check-in.
+   */
+  private int getMinToStart() {
+    // One server should be enough to get us off the ground.
+    int requiredMinToStart = 1;
+    if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
+      if (!BaseLoadBalancer.userTablesOnMaster(master.getConfiguration())) {
+        // If Master is carrying regions but NOT user-space regions (the current default),
+        // since the Master shows as a 'server', we need at least one more server to check
+        // in before we can start up so up defaultMinToStart to 2.
+        requiredMinToStart = 2;
+      }
+    }
+    int minToStart = this.master.getConfiguration().getInt(WAIT_ON_REGIONSERVERS_MINTOSTART,
-1);
+    // Ensure we are never less than requiredMinToStart else stuff won't work.
+    return minToStart == -1 || minToStart < requiredMinToStart? requiredMinToStart: minToStart;
+  }
+
+  /**
    * Wait for the region servers to report in.
    * We will wait until one of this condition is met:
    *  - the master is stopped
@@ -1005,35 +1088,20 @@ public class ServerManager {
    * @throws InterruptedException
    */
   public void waitForRegionServers(MonitoredTask status)
-  throws InterruptedException {
+      throws InterruptedException {
     final long interval = this.master.getConfiguration().
-      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
+        getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
     final long timeout = this.master.getConfiguration().
-      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
-    int defaultMinToStart = 1;
-    if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
-      // If we assign regions to master, we'd like to start
-      // at least another region server so that we don't
-      // assign all regions to master if other region servers
-      // don't come up in time.
-      defaultMinToStart = 2;
-    }
-    int minToStart = this.master.getConfiguration().
-      getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, defaultMinToStart);
-    if (minToStart < 1) {
-      LOG.warn(String.format(
-        "The value of '%s' (%d) can not be less than 1, ignoring.",
-        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
-      minToStart = 1;
-    }
+        getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
+    // Min is not an absolute; just a friction making us wait longer on server checkin.
+    int minToStart = getMinToStart();
     int maxToStart = this.master.getConfiguration().
-      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
+        getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
     if (maxToStart < minToStart) {
-        LOG.warn(String.format(
-            "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
-            WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
-            WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
-        maxToStart = Integer.MAX_VALUE;
+      LOG.warn(String.format("The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
+          WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
+          WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
+      maxToStart = Integer.MAX_VALUE;
     }
 
     long now =  System.currentTimeMillis();
@@ -1043,16 +1111,25 @@ public class ServerManager {
     long lastCountChange = startTime;
     int count = countOfRegionServers();
     int oldCount = 0;
-    while (!this.master.isStopped() && count < maxToStart
-        && (lastCountChange+interval > now || timeout > slept || count <
minToStart)) {
+    // This while test is a little hard to read. We try to comment it in below but in essence:
+    // Wait if Master is not stopped and the number of regionservers that have checked-in
is
+    // less than the maxToStart. Both of these conditions will be true near universally.
+    // Next, we will keep cycling if ANY of the following three conditions are true:
+    // 1. The time since a regionserver registered is < interval (means servers are actively
checking in).
+    // 2. We are under the total timeout.
+    // 3. The count of servers is < minimum.
+    for (ServerListener listener: this.listeners) {
+      listener.waiting();
+    }
+    while (!this.master.isStopped() && count < maxToStart &&
+        ((lastCountChange + interval) > now || timeout > slept || count < minToStart))
{
       // Log some info at every interval time or if there is a change
-      if (oldCount != count || lastLogTime+interval < now){
+      if (oldCount != count || lastLogTime + interval < now) {
         lastLogTime = now;
         String msg =
-          "Waiting for region servers count to settle; currently"+
-            " checked in " + count + ", slept for " + slept + " ms," +
-            " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
-            ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
+            "Waiting on RegionServer count=" + count + " to settle; waited="+
+                slept + "ms, expecting min=" + minToStart + " server(s), max="+ getStrForMax(maxToStart)
+
+                " server(s), " + "timeout=" + timeout + "ms, lastChange=" + (lastCountChange
- now) + "ms";
         LOG.info(msg);
         status.setStatus(msg);
       }
@@ -1070,11 +1147,14 @@ public class ServerManager {
       }
     }
 
-    LOG.info("Finished waiting for region servers count to settle;" +
-      " checked in " + count + ", slept for " + slept + " ms," +
-      " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
-      " master is "+ (this.master.isStopped() ? "stopped.": "running")
-    );
+    LOG.info("Finished wait on RegionServer count=" + count + "; waited=" + slept + "ms,"
+
+        " expected min=" + minToStart + " server(s), max=" +  getStrForMax(maxToStart) +
" server(s),"+
+        " master is "+ (this.master.isStopped() ? "stopped.": "running")
+        );
+  }
+
+  private String getStrForMax(final int max) {
+    return max == Integer.MAX_VALUE? "NO_LIMIT": Integer.toString(max);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 11621e2..3544757 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -315,7 +315,7 @@ public class HRegionServer extends HasThread implements
   // space regions.
   private boolean stopping = false;
 
-  private volatile boolean killed = false;
+  volatile boolean killed = false;
 
   protected final Configuration conf;
 
@@ -940,8 +940,6 @@ public class HRegionServer extends HasThread implements
     try {
       if (!isStopped() && !isAborted()) {
         ShutdownHook.install(conf, fs, this, Thread.currentThread());
-        // Set our ephemeral znode up in zookeeper now we have a name.
-        createMyEphemeralNode();
         // Initialize the RegionServerCoprocessorHost now that our ephemeral
         // node was created, in case any coprocessors want to use ZooKeeper
         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
@@ -1417,6 +1415,9 @@ public class HRegionServer extends HasThread implements
         }
         this.conf.set(key, value);
       }
+      // Set our ephemeral znode up in zookeeper now we have a name.
+      createMyEphemeralNode();
+
       if (updateRootDir) {
         // initialize file system by the config fs.defaultFS and hbase.rootdir from master
         initializeFileSystem();
@@ -2180,6 +2181,7 @@ public class HRegionServer extends HasThread implements
    * logs but it does close socket in case want to bring up server on old
    * hostname+port immediately.
    */
+  @VisibleForTesting
   protected void kill() {
     this.killed = true;
     abort("Simulated kill");

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
index 413f226..7529aaa 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
@@ -70,22 +70,24 @@ public class DrainingServerTracker extends ZooKeeperListener {
   public void start() throws KeeperException, IOException {
     watcher.registerListener(this);
     // Add a ServerListener to check if a server is draining when it's added.
-    serverManager.registerListener(
-        new ServerListener() {
+    serverManager.registerListener(new ServerListener() {
+      @Override
+      public void serverAdded(ServerName sn) {
+        if (drainingServers.contains(sn)){
+          serverManager.addServerToDrainList(sn);
+        }
+      }
 
-          @Override
-          public void serverAdded(ServerName sn) {
-            if (drainingServers.contains(sn)){
-              serverManager.addServerToDrainList(sn);
-            }
-          }
+      @Override
+      public void waiting() {
+        // TODO Auto-generated method stub
+      }
 
-          @Override
-          public void serverRemoved(ServerName serverName) {
-            // no-op
-          }
-        }
-    );
+      @Override
+      public void serverRemoved(ServerName serverName) {
+        // TODO Auto-generated method stub
+      }
+    });
     List<String> servers =
       ZKUtil.listChildrenAndWatchThem(watcher, watcher.drainingZNode);
     add(servers);

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentListener.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentListener.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentListener.java
index f171821..f582ee6 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentListener.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentListener.java
@@ -151,6 +151,11 @@ public class TestAssignmentListener {
     public int getRemovedCount() {
       return removedCount.get();
     }
+
+    @Override
+    public void waiting() {
+      // TODO Auto-generated method stub
+    }
   }
 
   @BeforeClass

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureEvents.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureEvents.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureEvents.java
index c87794b..b7a6434 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureEvents.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureEvents.java
@@ -140,7 +140,7 @@ public class TestMasterProcedureEvents {
     while (!master.getServerManager().isServerDead(hrs.getServerName())) Thread.sleep(10);
 
     // Do some of the master processing of dead servers so when SCP runs, it has expected
'state'.
-    master.getServerManager().moveFromOnelineToDeadServers(hrs.getServerName());
+    master.getServerManager().moveFromOnlineToDeadServers(hrs.getServerName());
 
     long procId = procExec.submitProcedure(
       new ServerCrashProcedure(procExec.getEnvironment(), hrs.getServerName(), true, carryingMeta));

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
index 3fc28891..e226769 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
@@ -115,7 +115,7 @@ public class TestServerCrashProcedure {
       // Now, reenable processing else we can't get a lock on the ServerCrashProcedure.
       master.setServerCrashProcessingEnabled(true);
       // Do some of the master processing of dead servers so when SCP runs, it has expected
'state'.
-      master.getServerManager().moveFromOnelineToDeadServers(hrs.getServerName());
+      master.getServerManager().moveFromOnlineToDeadServers(hrs.getServerName());
       // Enable test flags and then queue the crash procedure.
       ProcedureTestingUtility.waitNoProcedureRunning(procExec);
       ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);

http://git-wip-us.apache.org/repos/asf/hbase/blob/ca5b8a44/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
index 9a48db7..8dd2457 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
@@ -19,115 +19,173 @@
 package org.apache.hadoop.hbase.regionserver;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
-import java.net.InetSocketAddress;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CategoryBasedTimeout;
 import org.apache.hadoop.hbase.CoordinatedStateManager;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
-import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.ServerManager;
-import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.Threads;
+import org.junit.Rule;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.junit.rules.TestRule;
 
 /**
- * Tests region server termination during startup.
+ * Tests that a regionserver that dies after reporting for duty gets removed
+ * from list of online regions. See HBASE-9593.
  */
-@Category(LargeTests.class)
+@Category({RegionServerTests.class, MediumTests.class})
 public class TestRSKilledWhenInitializing {
-  private static boolean masterActive = false;
-  private static AtomicBoolean firstRS = new AtomicBoolean(true);
+  private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
+  @Rule public TestName testName = new TestName();
+  @Rule public final TestRule timeout = CategoryBasedTimeout.builder().
+    withTimeout(this.getClass()).withLookingForStuckThread(true).build();
+
+  // This boolean needs to be globally available. It is used below in our
+  // mocked up regionserver so it knows when to die.
+  private static AtomicBoolean masterActive = new AtomicBoolean(false);
+  // Ditto for this variable. It also is used in the mocked regionserver class.
+  private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>();
+
+  private static final int NUM_MASTERS = 1;
+  private static final int NUM_RS = 2;
 
   /**
    * Test verifies whether a region server is removing from online servers list in master
if it went
-   * down after registering with master.
+   * down after registering with master. Test will TIMEOUT if an error!!!!
    * @throws Exception
    */
-  @Test(timeout = 180000)
-  public void testRSTermnationAfterRegisteringToMasterBeforeCreatingEphemeralNod() throws
Exception {
-
-    final int NUM_MASTERS = 1;
-    final int NUM_RS = 2;
-    firstRS.set(true);
+  @Test
+  public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode()
+  throws Exception {
     // Create config to use for this cluster
     Configuration conf = HBaseConfiguration.create();
     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
-
     // Start the cluster
     final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
     TEST_UTIL.startMiniDFSCluster(3);
     TEST_UTIL.startMiniZKCluster();
     TEST_UTIL.createRootDir();
     final LocalHBaseCluster cluster =
-        new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class, MockedRegionServer.class);
-    final MasterThread master = cluster.getMasters().get(0);
-    master.start();
+        new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class,
+            RegisterAndDieRegionServer.class);
+    final MasterThread master = startMaster(cluster.getMasters().get(0));
     try {
-      long startTime = System.currentTimeMillis();
-      while (!master.getMaster().isActiveMaster()) {
-        try {
-          Thread.sleep(100);
-        } catch (InterruptedException ignored) {
-        }
-        if (System.currentTimeMillis() > startTime + 30000) {
-          throw new RuntimeException("Master not active after 30 seconds");
-        }
+      // Master is up waiting on RegionServers to check in. Now start RegionServers.
+      for (int i = 0; i < NUM_RS; i++) {
+        cluster.getRegionServers().get(i).start();
       }
-      masterActive = true;
-      cluster.getRegionServers().get(0).start();
-      cluster.getRegionServers().get(1).start();
-      Thread.sleep(10000);
-      List<ServerName> onlineServersList =
-          master.getMaster().getServerManager().getOnlineServersList();
-      while (onlineServersList.size() > 1) {
-        Thread.sleep(100);
+      // Now wait on master to see NUM_RS + 1 servers as being online, thats NUM_RS plus
+      // the Master itself (because Master hosts hbase:meta and checks in as though it a
RS).
+      List<ServerName> onlineServersList = null;
+      do {
         onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
+      } while (onlineServersList.size() < NUM_RS);
+      // Wait until killedRS is set. Means RegionServer is starting to go down.
+      while (killedRS.get() == null) {
+        Threads.sleep(1);
       }
-      assertEquals(onlineServersList.size(), 1);
-      cluster.shutdown();
+      // Wait on the RegionServer to fully die.
+      while (cluster.getLiveRegionServers().size() >= NUM_RS) {
+        Threads.sleep(1);
+      }
+      // Make sure Master is fully up before progressing. Could take a while if regions
+      // being reassigned.
+      while (!master.getMaster().isInitialized()) {
+        Threads.sleep(1);
+      }
+
+      // Now in steady state. Make sure the killed RS is no longer registered.
+      // branch-1 works differently to master branch.
+      assertTrue(!master.getMaster().getServerManager().isServerOnline(killedRS.get()));
     } finally {
-      masterActive = false;
-      firstRS.set(true);
-      TEST_UTIL.shutdownMiniCluster();
+      cluster.shutdown();
+      cluster.join();
+      TEST_UTIL.shutdownMiniDFSCluster();
+      TEST_UTIL.shutdownMiniZKCluster();
+      TEST_UTIL.cleanupTestDir();
     }
   }
 
-  public static class MockedRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer
{
+  /**
+   * Start Master. Get as far as the state where Master is waiting on
+   * RegionServers to check in, then return.
+   */
+  private MasterThread startMaster(MasterThread master) {
+    master.start();
+    // It takes a while until ServerManager creation to happen inside Master startup.
+    while (master.getMaster().getServerManager() == null) {
+      continue;
+    }
+    // Set a listener for the waiting-on-RegionServers state. We want to wait
+    // until this condition before we leave this method and start regionservers.
+    final AtomicBoolean waiting = new AtomicBoolean(false);
+    if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM");
+    master.getMaster().getServerManager().registerListener(new ServerListener() {
+      @Override
+      public void waiting() {
+        waiting.set(true);
+      }
 
-    public MockedRegionServer(Configuration conf, CoordinatedStateManager cp)
-      throws IOException, InterruptedException {
+      @Override
+      public void serverAdded(ServerName serverName) {
+        // TODO Auto-generated method stub
+      }
+
+      @Override
+      public void serverRemoved(ServerName serverName) {
+        // TODO Auto-generated method stub
+      }
+    });
+    // Wait until the Master gets to place where it is waiting on RegionServers to check
in.
+    while (!waiting.get()) {
+      continue;
+    }
+    // Set the global master-is-active; gets picked up by regionservers later.
+    masterActive.set(true);
+    return master;
+  }
+
+  /**
+   * A RegionServer that reports for duty and then immediately dies if it is the first to
receive
+   * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the
master
+   * notices and so removes the region from its set of online regionservers.
+   */
+  static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer
{
+    public RegisterAndDieRegionServer(Configuration conf, CoordinatedStateManager cp)
+    throws IOException, InterruptedException {
       super(conf, cp);
     }
 
     @Override
-    protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException
{
-      if (firstRS.getAndSet(false)) {
-        InetSocketAddress address = super.getRpcServer().getListenerAddress();
-        if (address == null) {
-          throw new IOException("Listener channel is closed");
-        }
-        for (NameStringPair e : c.getMapEntriesList()) {
-          String key = e.getName();
-          // The hostname the master sees us as.
-          if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) {
-            String hostnameFromMasterPOV = e.getValue();
-            assertEquals(address.getHostName(), hostnameFromMasterPOV);
-          }
-        }
-        while (!masterActive) {
+    protected void handleReportForDutyResponse(RegionServerStartupResponse c)
+    throws IOException {
+      if (killedRS.compareAndSet(null, getServerName())) {
+        // Make sure Master is up so it will see the removal of the ephemeral znode for this
RS.
+        while (!masterActive.get()) {
           Threads.sleep(100);
         }
         super.kill();


Mime
View raw message