hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mbau...@apache.org
Subject svn commit: r1365720 - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/regionserver/ main/java/org/apache/hadoop/hbase/util/ test/java/org/apache/hadoop/hbase/master/
Date Wed, 25 Jul 2012 20:27:34 GMT
Author: mbautin
Date: Wed Jul 25 20:27:33 2012
New Revision: 1365720

URL: http://svn.apache.org/viewvc?rev=1365720&view=rev
Log:
[master] [89-fb] Fix TestMasterTransitions and make mini-cluster test debugging easier

Author: mbautin

Summary:
TestMasterTransitions started failing when we moved RPC server initialization from the constructor
to the run method. The test keeps trying to start a regionserver on the same port in a loop,
and the old way of handling the bind exception stopped working.

Also changing regionserver thread names to include the server host, port, and generation stamp
and doing some other minor code cleanup.

Test Plan:
Run TestMasterTransitions 100 times
Run all unit tests

Reviewers: pkhemani, liyintang, kranganathan

Reviewed By: pkhemani

Differential Revision: https://reviews.facebook.net/D4323

Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Threads.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1365720&r1=1365719&r2=1365720&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Wed Jul 25 20:27:33 2012
@@ -206,13 +206,13 @@ public class ServerManager {
     String hostAndPort = info.getServerAddress().toString();
     HServerInfo existingServer = haveServerWithSameHostAndPortAlready(info.getHostnamePort());
     if (existingServer != null) {
-      String message = "Server start rejected; we already have " + hostAndPort +
-        " registered; existingServer=" + existingServer + ", newServer=" + info;
+      String message = "Duplicate regionserver check-in for host/port: " + hostAndPort +
+          "; existingServer=" + existingServer + ", newServer=" + info;
       LOG.info(message);
       long existingStartCode = existingServer.getStartCode();
       long newStartCode = info.getStartCode();
       if (existingStartCode < newStartCode) {
-        LOG.info("Triggering server recovery; existingServer looks stale");
+        LOG.info("Existing regionserver looks stale, expiring: " + existingServer);
         expireServer(existingServer);
       } else if (existingStartCode == newStartCode) {
         LOG.debug("Duplicate region server check-in with start code " + existingStartCode + ": " +

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1365720&r1=1365719&r2=1365720&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Wed Jul 25 20:27:33 2012
@@ -48,9 +48,9 @@ import java.util.TreeMap;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Executors;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.SynchronousQueue;
@@ -73,7 +73,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.Chore;
-import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
@@ -96,18 +95,12 @@ import org.apache.hadoop.hbase.Stoppable
 import org.apache.hadoop.hbase.UnknownRowLockException;
 import org.apache.hadoop.hbase.UnknownScannerException;
 import org.apache.hadoop.hbase.YouAreDeadException;
-import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
-import org.apache.hadoop.hbase.HMsg.Type;
-import org.apache.hadoop.hbase.Leases.LeaseStillHeldException;
-import org.apache.hadoop.hbase.client.MultiAction;
-import org.apache.hadoop.hbase.client.MultiResponse;
-import org.apache.hadoop.hbase.client.Mutation;
-import org.apache.hadoop.hbase.client.Row;
-import org.apache.hadoop.hbase.client.RowMutations;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.MultiAction;
 import org.apache.hadoop.hbase.client.MultiPut;
 import org.apache.hadoop.hbase.client.MultiPutResponse;
+import org.apache.hadoop.hbase.client.MultiResponse;
 import org.apache.hadoop.hbase.client.Mutation;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Result;
@@ -119,8 +112,8 @@ import org.apache.hadoop.hbase.io.hfile.
 import org.apache.hadoop.hbase.io.hfile.LruBlockCache;
 import org.apache.hadoop.hbase.io.hfile.LruBlockCache.CacheStats;
 import org.apache.hadoop.hbase.ipc.HBaseRPC;
-import org.apache.hadoop.hbase.ipc.HBaseRPCOptions;
 import org.apache.hadoop.hbase.ipc.HBaseRPCErrorHandler;
+import org.apache.hadoop.hbase.ipc.HBaseRPCOptions;
 import org.apache.hadoop.hbase.ipc.HBaseRPCProtocolVersion;
 import org.apache.hadoop.hbase.ipc.HBaseServer;
 import org.apache.hadoop.hbase.ipc.HMasterRegionInterface;
@@ -138,6 +131,7 @@ import org.apache.hadoop.hbase.util.Byte
 import org.apache.hadoop.hbase.util.DaemonThreadFactory;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.HasThread;
 import org.apache.hadoop.hbase.util.InfoServer;
 import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.ParamFormat;
@@ -156,7 +150,6 @@ import org.apache.zookeeper.KeeperExcept
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.Watcher.Event.EventType;
-import org.apache.hadoop.hbase.util.HasThread;
 
 import com.google.common.base.Preconditions;
 
@@ -308,7 +301,6 @@ public class HRegionServer implements HR
   private final HServerAddress address;
 
   // The main region server thread.
-  @SuppressWarnings("unused")
   private Thread regionServerThread;
 
   private final String machineName;
@@ -445,6 +437,9 @@ public class HRegionServer implements HR
     this.serverInfo = new HServerInfo(new HServerAddress(
       new InetSocketAddress(address.getBindAddress(), port)),
       System.currentTimeMillis(), machineName);
+    if (regionServerThread != null) {
+      Threads.renameThread(regionServerThread, getRSThreadName());
+    }
     if (this.serverInfo.getServerAddress() == null) {
       throw new NullPointerException("Server address cannot be null; " +
         "hbase-958 debugging");
@@ -571,6 +566,7 @@ public class HRegionServer implements HR
   @Override
   public void run() {
     regionServerThread = Thread.currentThread();
+    Threads.renameThread(regionServerThread, getRSThreadName());
     boolean quiesceRequested = false;
     try {
       MapWritable w = null;
@@ -1406,7 +1402,7 @@ public class HRegionServer implements HR
    */
   private void startServiceThreads() throws IOException {
     HBaseRPC.startProxy();
-    String n = Thread.currentThread().getName();
+    String n = getRSThreadName();
     UncaughtExceptionHandler handler = new UncaughtExceptionHandler() {
       @Override
       public void uncaughtException(Thread t, Throwable e) {
@@ -1451,9 +1447,6 @@ public class HRegionServer implements HR
           port++;
         }
       }
-      // update HRS server info port.
-      this.serverInfo = new HServerInfo(this.serverInfo.getServerAddress(),
-        this.serverInfo.getStartCode(), this.serverInfo.getHostname());
     }
 
     this.replicationHandler.startReplicationServices();
@@ -2730,10 +2723,10 @@ public class HRegionServer implements HR
   }
 
   /**
-   * @return true if a stop has been requested.
+   * @return true if a stop or abort has been requested.
    */
   public boolean isStopRequested() {
-    return this.stopRequested.get();
+    return this.stopRequested.get() || abortRequested;
   }
 
   /**
@@ -3115,7 +3108,6 @@ public class HRegionServer implements HR
     return serverInfo;
   }
 
-  @SuppressWarnings("unchecked")
   @Override
   public MultiResponse multiAction(MultiAction mActions) throws IOException {
     checkOpen();
@@ -3422,4 +3414,9 @@ public class HRegionServer implements HR
     return null;
   }
 
+  /** @return what the regionserver thread name should be */
+  public String getRSThreadName() {
+    return "RS-" + serverInfo.getServerName();
+  }
+
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Threads.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Threads.java?rev=1365720&r1=1365719&r2=1365720&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Threads.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Threads.java Wed Jul 25 20:27:33 2012
@@ -201,4 +201,19 @@ public class Threads {
     boundedCachedThreadPool.allowCoreThreadTimeOut(true);
     return boundedCachedThreadPool;
   }
+
+  /**
+   * Renames the given thread and logs the transition at INFO level. Does nothing if the thread
+   * already has the requested name.
+   * @param t the thread to rename
+   * @param newName the name the thread should have afterwards
+   */
+  public static void renameThread(Thread t, String newName) {
+    String oldName = t.getName();
+    // Compare the names: a Thread object is never equal to a String, so testing the thread
+    // itself against newName would make this guard always pass.
+    if (!oldName.equals(newName)) {
+      LOG.info("Thread '" + oldName + "' is now known as '" + newName + "'");
+      t.setName(newName);
+    }
+  }
 }

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java?rev=1365720&r1=1365719&r2=1365720&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java Wed Jul 25 20:27:33 2012
@@ -261,6 +261,7 @@ public class TestMasterTransitions {
    */
   @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
   throws IOException {
+    HBaseTestingUtility.setThreadNameFromMethod();
     LOG.info("Running testAddingServerBeforeOldIsDead2413");
     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
     int count = count();
@@ -279,21 +280,43 @@ public class TestMasterTransitions {
       HRegionServer hrs = null;
       while (true) {
         try {
+          LOG.info("A new attempt to start a regionserver on port " + port);
           hrs = cluster.startRegionServer().getRegionServer();
-          break;
+          Threads.sleepWithoutInterrupt(1000);
+          if (hrs.isOnline() && !hrs.isStopRequested()) {
+            // The regionserver has started successfully.
+            break;
+          }
         } catch (IOException e) {
           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
             InvocationTargetException ee = (InvocationTargetException)e.getCause();
             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
               LOG.info("BindException; retrying: " + e.toString());
             }
+          } else {
+            // Unknown error when constructing the regionserver, fail the unit test.
+            throw e;
           }
         }
+        // Bind exception, or the regionserver failed to start. Retry.
       }
       LOG.info("STARTED=" + hrs);
-      // Wait until he's been given at least 3 regions before we go on to try
+
+      // Wait until the regionserver has been given at least 3 regions before we go on to try
       // and count rows in table.
-      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
+      int requiredNumRegions = 3;
+      int numOnlineRegions;
+      long lastLogTime = 0;
+      while ((numOnlineRegions = hrs.getOnlineRegions().size()) < requiredNumRegions) {
+        Threads.sleep(100);
+        long now = System.currentTimeMillis();
+        if (now - lastLogTime > 5 * 1000) {
+          LOG.debug("Waiting for " + requiredNumRegions + " to be assigned to " +
+              hrs.getServerInfo().getServerName() + ", currently: " + numOnlineRegions);
+          lastLogTime = now;
+        }
+      }
+
       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
         " regions");
       assertEquals(count, count());



Mime
View raw message