hbase-commits mailing list archives

From: mbau...@apache.org
Subject: svn commit: r1308651 [1/2] - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/regionserver/ main/java/org/apache/hadoop/hbase/regionserver/wal/ main/java/org/...
Date: Tue, 03 Apr 2012 02:04:27 GMT
Author: mbautin
Date: Tue Apr  3 02:04:26 2012
New Revision: 1308651

URL: http://svn.apache.org/viewvc?rev=1308651&view=rev
Log:
[master] Make log splitting and master failover tests more stable

Summary:
I have noticed flakiness in TestLogSplitOnMasterFailover, and while addressing
it, made some more improvements to simulated regionserver and master killing and
to daemon isolation in unit tests (a minimal sketch of the per-daemon stop wiring
appears after the summary list):

* Making TestLogSplitOnMasterFailover itself more stable by waiting until all
regions are online before proceeding with the second half of data loading.
Making behavior more consistent by waiting until the master starts splitting the
dead regionserver's logs before killing the master itself. In this case, the
backup master always has to split a log directory with a "-splitting" suffix.
* Making HBaseTestingUtility.waitUntilAllRegionsAssigned more robust by
contacting regionservers and verifying that they have opened the regions that
META says they have opened.
* Adding an option to use a separate DFS client in each master and regionserver,
so that datanodes marked as bad in one client do not affect others.
* Shutting down the log syncer thread on a simulated regionserver kill.
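
For context, the diff below replaces the raw AtomicBoolean stop flags with small
Stoppable/StopStatus interfaces so that a single daemon can be stopped in a test
without touching any cluster-wide shutdown flag. The following is a minimal,
self-contained sketch of that pattern: the three interfaces/classes mirror the
StopStatus, Stoppable, and StoppableImpl types added in this revision, while the
worker thread is only a simplified stand-in for Chore/Sleeper, not the real
HBase classes.

    // Sketch only: StopStatus/Stoppable/StoppableImpl mirror the types added in
    // this commit; the worker below is a simplified stand-in for Chore.
    interface StopStatus {
      boolean isStopped();
    }

    interface Stoppable extends StopStatus {
      void stop(String why);
      String getStopReason();
    }

    class StoppableImpl implements Stoppable {
      private volatile boolean stopped;
      private String stopReason;
      @Override public void stop(String why) { stopReason = why; stopped = true; }
      @Override public boolean isStopped() { return stopped; }
      @Override public String getStopReason() { return stopReason; }
    }

    public class StoppableSketch {
      public static void main(String[] args) throws InterruptedException {
        final Stoppable stop = new StoppableImpl();

        // Chore-like loop: keeps running until its own stopper is flipped.
        Thread worker = new Thread(() -> {
          while (!stop.isStopped()) {
            try {
              Thread.sleep(100);   // periodic work would go here
            } catch (InterruptedException e) {
              return;
            }
          }
          System.out.println("Worker exiting: " + stop.getStopReason());
        });
        worker.start();

        Thread.sleep(300);
        stop.stop("simulated regionserver kill in a unit test");
        worker.join();
      }
    }

Stopping one daemon in a test then only flips that daemon's Stoppable; shutting
down the whole cluster remains a separate request (see requestClusterShutdown in
the HMaster hunk below).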

Test Plan:
Run all unit tests once.
Run all tests whose name contains "LogSplit" in a loop overnight.

Reviewers: pkhemani, liyintang, kranganathan, pritamdamania

Reviewed By: pkhemani

Differential Revision: https://reviews.facebook.net/D2325

Added:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StopStatus.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Stoppable.java
      - copied, changed from r1307631, hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Stoppable.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StoppableImpl.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/StoppableMaster.java
Removed:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Stoppable.java
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/OldLogsCleaner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/ShutdownHook.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RackManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/MultiMasterTest.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestClusterStartupDetection.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitAtStartup.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestOldLogsCleaner.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionServerOperationQueue.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionStateOnMasterFailure.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Chore.java Tue Apr  3 02:04:26 2012
@@ -19,8 +19,6 @@
  */
 package org.apache.hadoop.hbase;
 
-import java.util.concurrent.atomic.AtomicBoolean;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.util.Sleeper;
@@ -39,12 +37,7 @@ public abstract class Chore extends Thre
   private final Log LOG = LogFactory.getLog(this.getClass());
   private final Sleeper sleeper;
 
-  /**
-   * This variable might belong to someone else, e.g. HMaster. Setting this
-   * variable might trigger cluster shutdown. To shut down this thread only,
-   * use {@link #threadShouldStop}.
-   */
-  protected volatile AtomicBoolean stop;
+  private Stoppable stop;
 
   /**
    * Unlike {@link #stop}, this indicates that the current thread should shut
@@ -59,7 +52,7 @@ public abstract class Chore extends Thre
    * @param s When this flag is set to true, this thread will cleanup and exit
    * cleanly.
    */
-  public Chore(String name, final int p, final AtomicBoolean s) {
+  public Chore(String name, final int p, final Stoppable s) {
     super(name);
     this.sleeper = new Sleeper(p, s);
     this.stop = s;
@@ -72,7 +65,7 @@ public abstract class Chore extends Thre
   public void run() {
     try {
       boolean initialChoreComplete = false;
-      while (!this.stop.get() && !threadShouldStop) {
+      while (!stop.isStopped() && !threadShouldStop) {
         long startTime = System.currentTimeMillis();
         try {
           if (!initialChoreComplete) {
@@ -82,15 +75,15 @@ public abstract class Chore extends Thre
           }
         } catch (Exception e) {
           LOG.error("Caught exception", e);
-          if (this.stop.get()) {
+          if (stop.isStopped()) {
             continue;
           }
         }
         this.sleeper.sleep(startTime);
       }
     } catch (Throwable t) {
-      LOG.fatal("Caught error. Starting shutdown.", t);
-      this.stop.set(true);
+      LOG.fatal("Caught error. Shutting down the master.", t);
+      stop.stop("Error in " + getName() + ": " + t.getMessage());
     } finally {
       LOG.info(getName() + " exiting");
     }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java Tue Apr  3 02:04:26 2012
@@ -466,7 +466,10 @@ public final class HConstants {
   /** The number of favored nodes for each region */
   public static final int FAVORED_NODES_NUM = 3;
 
-  public static final String UNKOWN_RACK = "Unkown Rack";
+  public static final String UNKNOWN_RACK = "Unknown Rack";
+
+  /** Delay when waiting for a variable (HBASE-4712) */
+  public static final int VARIABLE_WAIT_TIME_MS = 40;
 
   private HConstants() {
     // Can't be instantiated with this ctor.

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StopStatus.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StopStatus.java?rev=1308651&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StopStatus.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StopStatus.java Tue Apr  3 02:04:26 2012
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hbase;
+
+public interface StopStatus {
+
+  /** @return true if the service has been stopped */
+  public boolean isStopped();
+
+}

Copied: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Stoppable.java (from r1307631, hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Stoppable.java)
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Stoppable.java?p2=hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Stoppable.java&p1=hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Stoppable.java&r1=1307631&r2=1308651&rev=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Stoppable.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/Stoppable.java Tue Apr  3 02:04:26 2012
@@ -17,22 +17,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.hadoop.hbase.regionserver;
+package org.apache.hadoop.hbase;
 
 /**
- * Implementations are stoppable.
+ * Implementers are Stoppable.
  */
-interface Stoppable {
-  // Starting small, just doing a stoppable/stop for now and keeping it package
-  // protected for now.  Needed so don't have to pass RegionServer instance
-  // everywhere.  Doing Lifecycle seemed a stretch since none of our servers
-  // do natural start/stop, etc. RegionServer is hosted in a Thread (can't do
-  // 'stop' on a Thread and 'start' has special meaning for Threads) and then
-  // Master is implemented differently again (it is a Thread itself). We
-  // should move to redoing Master and RegionServer servers to use Spring or
-  // some such container but for now, I just need stop -- St.Ack.
+public interface Stoppable extends StopStatus {
   /**
-   * Stop service.
+   * Stop this service.
+   * @param why Why we're stopping.
    */
-  public void stop();
+  public void stop(String why);
+
+  /** @return why we are stopping */
+  String getStopReason();
 }

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StoppableImpl.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StoppableImpl.java?rev=1308651&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StoppableImpl.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/StoppableImpl.java Tue Apr  3 02:04:26 2012
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hbase;
+
+public class StoppableImpl implements Stoppable {
+
+  private String stopReason;
+  private volatile boolean stopped;
+
+  @Override
+  public void stop(String why) {
+    stopReason = why;
+    stopped = true;
+  }
+
+  @Override
+  public boolean isStopped() {
+    return stopped;
+  }
+
+  public String getStopReason() {
+    return stopReason;
+  }
+}

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java Tue Apr  3 02:04:26 2012
@@ -131,11 +131,11 @@ abstract class BaseScanner extends Chore
   // mid-scan
   final Object scannerLock = new Object();
 
-  BaseScanner(final HMaster master, final boolean rootRegion,
-      final AtomicBoolean stop) {
+  BaseScanner(final HMaster master, final boolean rootRegion) {
     super("Scanner for " + (rootRegion ? "-ROOT-":".META.") + " table",
         master.getConfiguration().
-        getInt("hbase.master.meta.thread.rescanfrequency", 60 * 1000), stop);
+        getInt("hbase.master.meta.thread.rescanfrequency", 60 * 1000),
+        master);
     this.rootRegion = rootRegion;
     this.master = master;
     this.initialScanComplete = false;

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Apr  3 02:04:26 2012
@@ -41,6 +41,7 @@ import java.util.concurrent.ThreadFactor
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
@@ -74,6 +75,8 @@ import org.apache.hadoop.hbase.LocalHBas
 import org.apache.hadoop.hbase.MasterNotRunningException;
 import org.apache.hadoop.hbase.MiniZooKeeperCluster;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.StopStatus;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.TableExistsException;
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
@@ -131,7 +134,7 @@ import com.google.common.collect.Lists;
  * @see Watcher
  */
 public class HMaster extends Thread implements HMasterInterface,
-    HMasterRegionInterface, Watcher {
+    HMasterRegionInterface, Watcher, StoppableMaster {
   // MASTER is name of the webapp and the attribute name used stuffing this
   //instance into web context.
   public static final String MASTER = "master";
@@ -151,8 +154,8 @@ public class HMaster extends Thread impl
   /**
    * This flag indicates that cluster shutdown has been requested. This is
    * different from {@link #closed} in that it is initially false, but is
-   * set to true at shutdown and remains true from then on. For killing one
-   * instance of the master, see {@link #killed}.
+   * set to true at shutdown and remains true from then on. For stopping one
+   * instance of the master, see {@link #stopped}.
    */
   private final AtomicBoolean clusterShutdownRequested =
       new AtomicBoolean(false);
@@ -203,8 +206,8 @@ public class HMaster extends Thread impl
   private long applyPreferredAssignmentPeriod = 0l;
   private long holdRegionForBestLocalityPeriod = 0l;
 
-  /** True if the master is being killed. No cluster shutdown is done. */
-  private volatile boolean killed = false;
+  /** True if the master is being stopped. No cluster shutdown is done. */
+  private volatile boolean stopped = false;
 
   /** Flag set after we become the active master (used for testing). */
   private volatile boolean isActiveMaster = false;
@@ -219,6 +222,16 @@ public class HMaster extends Thread impl
   private boolean shouldAssignRegionsWithFavoredNodes = false;
 
   /**
+   * The number of dead server log split requests received. This is not
+   * incremented during log splitting on startup. This field is never
+   * decremented. Used in unit tests.
+   */
+  private final AtomicInteger numDeadServerLogSplitRequests =
+      new AtomicInteger();
+
+  private String stopReason = "not stopping";
+
+  /**
    * Constructor
    * @param conf configuration
    * @throws IOException
@@ -246,7 +259,7 @@ public class HMaster extends Thread impl
     this.threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY,
         10 * 1000);
 
-    this.sleeper = new Sleeper(this.threadWakeFrequency, this.closed);
+    this.sleeper = new Sleeper(this.threadWakeFrequency, getStopper());
     this.connection = ServerConnectionManager.getConnection(conf);
 
     // hack! Maps DFSClient => Master for logs.  HDFS made this
@@ -283,7 +296,7 @@ public class HMaster extends Thread impl
     // TODO: Bring up the UI to redirect to active Master.
     zooKeeperWrapper.registerListener(this);
     this.zkMasterAddressWatcher =
-      new ZKMasterAddressWatcher(this.zooKeeperWrapper, this.clusterShutdownRequested);
+        new ZKMasterAddressWatcher(this.zooKeeperWrapper, this);
     zooKeeperWrapper.registerListener(zkMasterAddressWatcher);
 
     // if we're a backup master, stall until a primary to writes his address
@@ -306,7 +319,8 @@ public class HMaster extends Thread impl
       }
     }
 
-    // initilize the thread pool for log splitting.
+    final String masterName = getServerName();
+    // initialize the thread pool for non-distributed log splitting.
     int maxSplitLogThread =
       conf.getInt("hbase.master.splitLogThread.max", 1000);
     logSplitThreadPool = Threads.getBoundedCachedThreadPool(
@@ -314,7 +328,8 @@ public class HMaster extends Thread impl
         new ThreadFactory() {
           private int count = 1;
           public Thread newThread(Runnable r) {
-            Thread t = new Thread(r, "LogSplittingThread" + "-" + count++);
+            Thread t = new Thread(r, masterName + "-LogSplittingThread" + "-"
+                + count++);
             if (!t.isDaemon())
               t.setDaemon(true);
             return t;
@@ -351,7 +366,7 @@ public class HMaster extends Thread impl
     isActiveMaster = true;
 
     this.regionServerOperationQueue =
-      new RegionServerOperationQueue(this.conf, this.closed);
+      new RegionServerOperationQueue(this.conf, getClosedStatus());
 
     synchronized(this) {
       serverManager = new ServerManager(this);
@@ -432,12 +447,12 @@ public class HMaster extends Thread impl
 
             @Override
             public void abort(String why, Throwable e) {
-              killMaster();
+              stop("ZK session expired");
             }
 
             @Override
             public boolean isAborted() {
-              return isKilled();
+              return stopped;
             }
           });
     }
@@ -571,7 +586,7 @@ public class HMaster extends Thread impl
         FSUtils.checkFileSystemAvailable(this.fs);
       } catch (IOException e) {
         LOG.fatal("Shutting down HBase cluster: file system not available", e);
-        this.closed.set(true);
+        shutdownClusterNow();
         this.fsOk = false;
       }
     }
@@ -635,10 +650,6 @@ public class HMaster extends Thread impl
     return this.fs;
   }
 
-  public AtomicBoolean getClusterShutdownRequested() {
-    return this.clusterShutdownRequested;
-  }
-
   AtomicBoolean getClosed() {
     return this.closed;
   }
@@ -752,27 +763,15 @@ public class HMaster extends Thread impl
         }
       }
     } catch (Throwable t) {
-      LOG.fatal("Unhandled exception. Starting shutdown.", t);
+      LOG.fatal("Unhandled exception. Starting cluster shutdown.", t);
       startupStatus.cleanup();
-      this.closed.set(true);
+      shutdownClusterNow();
     }
+    closed.set(true);
 
-    startupStatus.cleanup();
-    if (!this.clusterShutdownRequested.get()) {  // shutdown not by request
-      if (!killed) {
-        // This would trigger a cluster shutdown, so we are not doing this when
-        // the master is being "killed" in a unit test.
-        shutdown();
-      }
-
-      // Get started with shutdown: stop scanners, etc. This does not lead to
-      // a cluster shutdown.
-      startShutdown();
-    } else if (killed) {
-      startShutdown();
-    }
+    startShutdown();
 
-    if (!killed) {
+    if (clusterShutdownRequested.get()) {
       // Wait for all the remaining region servers to report in. Only doing
       // this when the cluster is shutting down.
       this.serverManager.letRegionServersShutdown();
@@ -792,9 +791,10 @@ public class HMaster extends Thread impl
     }
     this.rpcServer.stop();
 
+    logSplitThreadPool.shutdown();
+
     regionManager.joinThreads();
-    if (!killed) {
-      // We are shutting down the cluster.
+    if (clusterShutdownRequested.get()) {
       zooKeeperWrapper.clearRSDirectory();
     }
 
@@ -1053,7 +1053,9 @@ public class HMaster extends Thread impl
     return (isSplitLogAfterStartupDone.get());
   }
 
-  public void splitLog(final String serverName) throws IOException {
+  public void splitDeadServerLog(final String serverName) throws IOException {
+    // Maintain the number of dead server split log requests for testing.
+    numDeadServerLogSplitRequests.incrementAndGet();
     List<String> serverNames = new ArrayList<String>();
     serverNames.add(serverName);
     splitLog(serverNames);
@@ -1107,7 +1109,7 @@ public class HMaster extends Thread impl
         this.splitLogLock.lock();
         try {
           HLog.splitLog(this.rootdir, logDir, oldLogDir, this.fs,
-              getConfiguration());
+              getConfiguration(), HLog.DEFAULT_LATEST_TS_TO_INCLUDE, this);
         } finally {
           this.splitLogLock.unlock();
         }
@@ -1119,6 +1121,20 @@ public class HMaster extends Thread impl
     }
   }
 
+  /**
+   * @return true if the master is shutting down (with or without shutting down
+   *         the cluster)
+   */
+  @Override
+  public boolean isStopped() {
+    return stopped;
+  }
+
+  @Override
+  public String getStopReason() {
+    return stopReason;
+  }
+
   /*
    * Start up all services. If any of these threads gets an unhandled exception
    * then they just die with a logged message.  This should be fine because
@@ -1141,7 +1157,7 @@ public class HMaster extends Thread impl
         // splitLogManager must be started before starting rpcServer because
         // region-servers dying will trigger log splitting
         this.splitLogManager = new SplitLogManager(zooKeeperWrapper, conf,
-            this.clusterShutdownRequested, address.toString());
+            getStopper(), address.toString());
         this.splitLogManager.finishInitialization();
       }
       // Start the server so that region servers are running before we start
@@ -1174,6 +1190,9 @@ public class HMaster extends Thread impl
     this.regionManager.stopScanners();
     this.regionServerOperationQueue.shutdown();
     this.serverManager.notifyServers();
+    if (splitLogManager != null) {
+      splitLogManager.stop();
+    }
   }
 
   @Override
@@ -1232,14 +1251,38 @@ public class HMaster extends Thread impl
     return !this.closed.get();
   }
 
+  /**
+   * This method's name should indicate that it will shut down the whole
+   * cluster, but renaming it may break client/server compatibility on
+   * upgrades.
+   */
   @Override
   public void shutdown() {
-    LOG.info("Cluster shutdown requested. Starting to quiesce servers");
-    this.clusterShutdownRequested.set(true);
-    this.zooKeeperWrapper.setClusterState(false);
-    if (splitLogManager != null) {
-      this.splitLogManager.stop();
+    requestClusterShutdown();
+  }
+
+  /**
+   * Request a shutdown the whole HBase cluster. This only modifies state
+   * flags in memory and in ZK, so it is safe to be called multiple times.
+   */
+  public void requestClusterShutdown() {
+    if (!clusterShutdownRequested.compareAndSet(false, true)) {
+      // Only request cluster shutdown once.
+      return;
     }
+
+    if (!closed.get()) {
+      LOG.info("Cluster shutdown requested. Starting to quiesce servers");
+    }
+    this.zooKeeperWrapper.setClusterState(false);
+    stopped = true;
+    stopReason = "cluster shutdown";
+  }
+
+  /** Shutdown the cluster quickly, don't quiesce regionservers */
+  private void shutdownClusterNow() {
+    closed.set(true);
+    requestClusterShutdown();
   }
 
   @Override
@@ -1994,7 +2037,12 @@ public class HMaster extends Thread impl
   }
 
   public String getServerName() {
-    return address.toString();
+    return HMaster.class.getSimpleName() + "-" + address.toString();
+  }
+
+  @Override
+  public String toString() {
+    return getServerName();
   }
 
   /**
@@ -2011,9 +2059,20 @@ public class HMaster extends Thread impl
     LOG.info("Cleared region " + region + " from transition map");
   }
 
-  public void stopMaster() {
-    LOG.info("Master stop requested, isActiveMaster=" + isActiveMaster);
+  /**
+   * Stop master without shutting down the cluster. Gets out of the master loop
+   * quickly. Does not quiesce regionservers.
+   */
+  @Override
+  public void stop(String why) {
+    LOG.info("Master stop requested, isActiveMaster=" + isActiveMaster
+        + ", reason=" + why);
+    stopped = true;
+    stopReason = why;
+
+    // Get out of the master loop.
     closed.set(true);
+
     // If we are a backup master, we need to interrupt wait
     if (!isActiveMaster) {
       zkMasterAddressWatcher.cancelMasterZNodeWait();
@@ -2026,18 +2085,8 @@ public class HMaster extends Thread impl
     }
   }
 
-  public void killMaster() {
-    LOG.info("Killing master without shutting down the cluster");
-    killed = true;
-    stopMaster();
-  }
-
-  public boolean isKilled() {
-    return killed;
-  }
-
   String getZKWrapperName() {
-    return getClass().getSimpleName() + "-" + getServerName();
+    return getServerName();
   }
 
   public SplitLogManager getSplitLogManager() {
@@ -2048,4 +2097,25 @@ public class HMaster extends Thread impl
     return logDirsSplitOnStartup;
   }
 
+  int getNumDeadServerLogSplitRequests() {
+    return numDeadServerLogSplitRequests.get();
+  }
+
+  public boolean isClusterShutdownRequested() {
+    return clusterShutdownRequested.get();
+  }
+
+  public StoppableMaster getStopper() {
+    return this;
+  }
+
+  public StopStatus getClosedStatus() {
+    return new StopStatus() {
+      @Override
+      public boolean isStopped() {
+        return closed.get();
+      }
+    };
+  }
+
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java Tue Apr  3 02:04:26 2012
@@ -51,7 +51,7 @@ class MetaScanner extends BaseScanner {
    * @param master
    */
   public MetaScanner(HMaster master) {
-    super(master, false, master.getClusterShutdownRequested());
+    super(master, false);
   }
 
   // Don't retry if we get an error while scanning. Errors are most often

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/OldLogsCleaner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/OldLogsCleaner.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/OldLogsCleaner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/OldLogsCleaner.java Tue Apr  3 02:04:26 2012
@@ -19,6 +19,8 @@
  */
 package org.apache.hadoop.hbase.master;
 
+import java.io.IOException;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -28,12 +30,9 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Chore;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.regionserver.wal.HLog;
 
-import java.io.IOException;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.regex.Pattern;
-
 /**
  * This Chore, everytime it runs, will clear the logs in the old logs folder
  * that are older than hbase.master.logcleaner.ttl and, in order to limit the
@@ -54,12 +53,12 @@ public class OldLogsCleaner extends Chor
   /**
    *
    * @param p the period of time to sleep between each run
-   * @param s the stopper boolean
+   * @param s the stopper
    * @param conf configuration to use
    * @param fs handle to the FS
    * @param oldLogDir the path to the archived logs
    */
-  public OldLogsCleaner(final int p, final AtomicBoolean s,
+  public OldLogsCleaner(final int p, final Stoppable s,
                         Configuration conf, FileSystem fs,
                         Path oldLogDir) {
     super("OldLogsCleaner", p, s);

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Tue Apr  3 02:04:26 2012
@@ -292,7 +292,7 @@ class ProcessServerShutdown extends Regi
       @Override
       public void run() {
         try {
-          master.splitLog(deadServer);
+          master.splitDeadServerLog(deadServer);
           logSplitResult = LogSplitResult.SUCCESS;
         } catch (Exception e) {
           LOG.error(

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java Tue Apr  3 02:04:26 2012
@@ -224,7 +224,7 @@ public class RegionManager {
 
   void reassignRootRegion() {
     unsetRootRegion();
-    if (!master.getClusterShutdownRequested().get()) {
+    if (!master.isClusterShutdownRequested()) {
       synchronized (regionsInTransition) {
         String regionName = HRegionInfo.ROOT_REGIONINFO.getRegionNameAsString();
         byte[] data = null;
@@ -1519,8 +1519,7 @@ public class RegionManager {
    */
   public void waitForRootRegionLocation() {
     synchronized (rootRegionLocation) {
-      while (!master.getClusterShutdownRequested().get() &&
-          !master.isClosed() && rootRegionLocation.get() == null) {
+      while (!master.isStopped() && rootRegionLocation.get() == null) {
         // rootRegionLocation will be filled in when we get an 'open region'
         // regionServerReport message from the HRegionServer that has been
         // allocated the ROOT region below.

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java Tue Apr  3 02:04:26 2012
@@ -26,7 +26,6 @@ import java.util.concurrent.CopyOnWriteA
 import java.util.concurrent.DelayQueue;
 import java.util.concurrent.PriorityBlockingQueue;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -35,6 +34,7 @@ import org.apache.hadoop.hbase.HConstant
 import org.apache.hadoop.hbase.HMsg;
 import org.apache.hadoop.hbase.HServerInfo;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.StopStatus;
 import org.apache.hadoop.hbase.master.RegionServerOperation.RegionServerOperationResult;
 import org.apache.hadoop.hbase.util.Sleeper;
 import org.apache.hadoop.ipc.RemoteException;
@@ -53,6 +53,8 @@ public class RegionServerOperationQueue 
   // TODO: Build up the junit test of this class.
   private final Log LOG = LogFactory.getLog(this.getClass());
 
+  private final StopStatus stop;
+
   /**
    * Enums returned by {@link RegionServerOperationQueue#process()};
    */
@@ -91,13 +93,12 @@ public class RegionServerOperationQueue 
   private final Set<RegionServerOperationListener> listeners =
     new CopyOnWriteArraySet<RegionServerOperationListener>();
   private final int threadWakeFrequency;
-  private final AtomicBoolean closed;
   private final Sleeper sleeper;
 
-  RegionServerOperationQueue(final Configuration c, final AtomicBoolean closed) {
+  RegionServerOperationQueue(final Configuration c, StopStatus stop) {
     this.threadWakeFrequency = c.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
-    this.closed = closed;
-    this.sleeper = new Sleeper(this.threadWakeFrequency, this.closed);
+    this.stop = stop;
+    this.sleeper = new Sleeper(this.threadWakeFrequency, stop);
   }
 
   public void put(final RegionServerOperation op) {
@@ -133,7 +134,7 @@ public class RegionServerOperationQueue 
 
     // At this point, if there's still no todo operation, or we're supposed to
     // be closed, return.
-    if (op == null || closed.get()) {
+    if (op == null || stop.isStopped()) {
       return ProcessingResultCode.NOOP;
     }
 

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java Tue Apr  3 02:04:26 2012
@@ -51,7 +51,7 @@ abstract class RetryableMetaOperation<T>
     this.m = m;
     this.master = master;
     this.sleeper = new Sleeper(this.master.getThreadWakeFrequency(),
-      this.master.getClosed());
+      master.getStopper());
   }
 
   protected T doWithRetries()

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java Tue Apr  3 02:04:26 2012
@@ -31,7 +31,7 @@ class RootScanner extends BaseScanner {
    * @param master
    */
   public RootScanner(HMaster master) {
-    super(master, true, master.getClusterShutdownRequested());
+    super(master, true);
   }
 
   /**

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Tue Apr  3 02:04:26 2012
@@ -49,6 +49,7 @@ import org.apache.hadoop.hbase.HServerAd
 import org.apache.hadoop.hbase.HServerInfo;
 import org.apache.hadoop.hbase.HServerLoad;
 import org.apache.hadoop.hbase.PleaseHoldException;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.YouAreDeadException;
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.Result;
@@ -124,7 +125,7 @@ public class ServerManager {
    * TODO: Make this a metric; dump metrics into log.
    */
   class ServerMonitor extends Chore {
-    ServerMonitor(final int period, final AtomicBoolean stop) {
+    ServerMonitor(final int period, final Stoppable stop) {
       super("ServerMonitor", period, stop);
     }
 
@@ -167,14 +168,13 @@ public class ServerManager {
       60 * 1000);
     this.minimumServerCount = c.getInt("hbase.regions.server.count.min", 0);
     this.serverMonitorThread = new ServerMonitor(metaRescanInterval,
-      this.master.getClusterShutdownRequested());
+      master.getStopper());
     String n = Thread.currentThread().getName();
     Threads.setDaemonThreadRunning(this.serverMonitorThread,
       n + ".serverMonitor");
     this.oldLogCleaner = new OldLogsCleaner(
       c.getInt("hbase.master.meta.thread.rescanfrequency",60 * 1000),
-        this.master.getClusterShutdownRequested(), c,
-        master.getFileSystem(), master.getOldLogDir());
+        master.getStopper(), c, master.getFileSystem(), master.getOldLogDir());
     Threads.setDaemonThreadRunning(oldLogCleaner,
       n + ".oldLogCleaner");
     rackManager = new RackManager(c);
@@ -304,7 +304,7 @@ public class ServerManager {
         this.quiescedServers.incrementAndGet();
       }
     }
-    if (this.master.getClusterShutdownRequested().get()) {
+    if (this.master.isClusterShutdownRequested()) {
       if (quiescedServers.get() >= serversToServerInfo.size()) {
         // If the only servers we know about are meta servers, then we can
         // proceed with shutdown
@@ -322,7 +322,7 @@ public class ServerManager {
         return new HMsg [] {HMsg.REGIONSERVER_QUIESCE};
       }
     }
-    if (this.master.isClosed() && !master.isKilled()) {
+    if (this.master.isClosed() && master.isClusterShutdownRequested()) {
       // Tell server to shut down if we are shutting down.  This should
       // happen after check of MSG_REPORT_EXITING above, since region server
       // will send us one of these messages after it gets MSG_REGIONSERVER_STOP

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Apr  3 02:04:26 2012
@@ -31,7 +31,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,12 +39,12 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Chore;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
-import org.apache.hadoop.hbase.regionserver.wal.OrphanHLogAfterSplitException;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
@@ -93,7 +92,7 @@ import org.apache.zookeeper.data.Stat;
 public class SplitLogManager implements Watcher {
   private static final Log LOG = LogFactory.getLog(SplitLogManager.class);
 
-  private final AtomicBoolean stopper;
+  private final Stoppable stopper;
   private final String serverName;
   private final TaskFinisher taskFinisher;
   private FileSystem fs;
@@ -129,7 +128,7 @@ public class SplitLogManager implements 
    * @param service
    */
   public SplitLogManager(ZooKeeperWrapper zkw, final Configuration conf,
-      AtomicBoolean stopper, String serverName) {
+      Stoppable stopper, String serverName) {
     this(zkw, conf, stopper, serverName, new TaskFinisher() {
       @Override
       public Status finish(String workerName, String logfile) {
@@ -159,7 +158,7 @@ public class SplitLogManager implements 
    * @param tf task finisher
    */
   public SplitLogManager(ZooKeeperWrapper zkw, Configuration conf,
-      AtomicBoolean stopper, String serverName, TaskFinisher tf) {
+      Stoppable stopper, String serverName, TaskFinisher tf) {
     this.watcher = zkw;
     this.watcher.createZNodeIfNotExists(this.watcher.splitLogZNode, new byte[0],
         CreateMode.PERSISTENT, false /* set watch? */);
@@ -323,7 +322,7 @@ public class SplitLogManager implements 
               + " done=" + batch.done
               + " error=" + batch.error);
           batch.wait(100);
-          if (stopper.get()) {
+          if (stopper.isStopped()) {
             LOG.warn("Stopped while waiting for log splits to be completed");
             return;
           }
@@ -849,7 +848,7 @@ public class SplitLogManager implements 
   private class TimeoutMonitor extends Chore {
     private int reported_tot = -1;
     private int reported_unassigned = -1;
-    public TimeoutMonitor(final int period, AtomicBoolean stopper) {
+    public TimeoutMonitor(final int period, Stoppable stopper) {
       super("SplitLogManager Timeout Monitor", period, stopper);
     }
 
@@ -1019,6 +1018,13 @@ public class SplitLogManager implements 
     public void processResult(int rc, String path, Object ctx) {
       tot_mgr_node_delete_result.incrementAndGet();
       if (rc != 0) {
+        if (rc == KeeperException.Code.SESSIONEXPIRED.intValue()) {
+          // If we keep retrying, we will spin in an infinite loop until the master dies.
+          // Therefore, we just bail out in this case.
+          LOG.error("ZK session expired, cannot delete " + path + ". Master must be killed " +
+              "in this case.");
+          return;
+        }
         if (rc != KeeperException.Code.NONODE.intValue()) {
           tot_mgr_node_delete_err.incrementAndGet();
           Long retry_count = (Long) ctx;

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/StoppableMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/StoppableMaster.java?rev=1308651&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/StoppableMaster.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/StoppableMaster.java Tue Apr  3 02:04:26 2012
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import org.apache.hadoop.hbase.Stoppable;
+
+/** Allows to shut down an HBase cluster. Used in master. */
+public interface StoppableMaster extends Stoppable {
+  void requestClusterShutdown();
+  boolean isClusterShutdownRequested();
+}

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ZKMasterAddressWatcher.java Tue Apr  3 02:04:26 2012
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.master;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
@@ -43,15 +44,16 @@ class ZKMasterAddressWatcher implements 
   private static final Log LOG = LogFactory.getLog(ZKMasterAddressWatcher.class);
 
   private ZooKeeperWrapper zookeeper;
-  private final AtomicBoolean requestShutdown;
+  private final StoppableMaster stoppable;
 
   /**
    * Create this watcher using passed ZooKeeperWrapper instance.
    * @param zk ZooKeeper
    * @param flag Flag to set to request shutdown.
    */
-  ZKMasterAddressWatcher(final ZooKeeperWrapper zk, final AtomicBoolean flag) {
-    this.requestShutdown = flag;
+  ZKMasterAddressWatcher(final ZooKeeperWrapper zk,
+      final StoppableMaster stoppable) {
+    this.stoppable = stoppable;
     this.zookeeper = zk;
   }
 
@@ -65,7 +67,7 @@ class ZKMasterAddressWatcher implements 
       if (event.getPath().equals(this.zookeeper.clusterStateZNode)) {
         LOG.info("Cluster shutdown while waiting, shutting down" +
           " this master.");
-        this.requestShutdown.set(true);
+        this.stoppable.requestClusterShutdown();
       } else if (event.getPath().equals(this.zookeeper.masterElectionZNode)){
         LOG.info("Master address ZNode deleted, notifying waiting masters");
         notifyAll();
@@ -90,7 +92,7 @@ class ZKMasterAddressWatcher implements 
    * blocks until the master address ZNode gets deleted.
    */
   private synchronized void waitForMasterAddressAvailability() {
-    while (!requestShutdown.get() &&
+    while (!stoppable.isStopped() &&
            zookeeper.readMasterAddress(zookeeper) != null) {
       try {
         LOG.debug("Waiting for master address ZNode to be deleted " +
@@ -113,7 +115,7 @@ class ZKMasterAddressWatcher implements 
     do {
       waitForMasterAddressAvailability();
       // Check if we need to shutdown instead of taking control
-      if (this.requestShutdown.get()) {
+      if (stoppable.isStopped()) {
         LOG.debug("Won't start Master because of requested shutdown");
         return false;
       }
@@ -137,7 +139,6 @@ class ZKMasterAddressWatcher implements 
   }
 
   synchronized void cancelMasterZNodeWait() {
-    requestShutdown.set(true);
     notifyAll();
   }
 

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Apr  3 02:04:26 2012
@@ -84,6 +84,8 @@ import org.apache.hadoop.hbase.Leases.Le
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.NotServingRegionException;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.StopStatus;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.UnknownRowLockException;
 import org.apache.hadoop.hbase.UnknownScannerException;
 import org.apache.hadoop.hbase.YouAreDeadException;
@@ -285,6 +287,8 @@ public class HRegionServer implements HR
   // Cache configuration and block cache reference
   private final CacheConfig cacheConfig;
 
+  private String stopReason = "not stopping";
+
   /**
    * Starts a HRegionServer at the default location
    * @param conf
@@ -320,7 +324,7 @@ public class HRegionServer implements HR
     this.msgInterval = conf.getInt("hbase.regionserver.msginterval",
         HConstants.REGION_SERVER_MSG_INTERVAL);
 
-    sleeper = new Sleeper(this.msgInterval, this.stopRequested);
+    sleeper = new Sleeper(this.msgInterval, this);
 
     this.maxScannerResultSize = conf.getLong(
             HConstants.HBASE_CLIENT_SCANNER_MAX_RESULT_SIZE_KEY,
@@ -421,7 +425,7 @@ public class HRegionServer implements HR
     int multiplier = this.conf.getInt(HConstants.THREAD_WAKE_FREQUENCY +
         ".multiplier", 1000);
     this.majorCompactionChecker = new MajorCompactionChecker(this,
-      this.threadWakeFrequency * multiplier,  this.stopRequested);
+      this.threadWakeFrequency * multiplier);
 
     this.leases = new Leases(
         (int) conf.getLong(HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY,
@@ -665,6 +669,7 @@ public class HRegionServer implements HR
 
     if (killed) {
       // Just skip out w/o closing regions.
+      hlog.kill();
     } else if (abortRequested) {
       if (this.fsOk) {
         // Only try to clean up if the file system is available
@@ -1004,8 +1009,8 @@ public class HRegionServer implements HR
     private final HRegionServer instance;
 
     MajorCompactionChecker(final HRegionServer h,
-        final int sleepTime, final AtomicBoolean stopper) {
-      super("MajorCompactionChecker", sleepTime, stopper);
+        final int sleepTime) {
+      super("MajorCompactionChecker", sleepTime, h);
       this.instance = h;
       LOG.info("Runs every " + sleepTime + "ms");
     }
@@ -1296,8 +1301,7 @@ public class HRegionServer implements HR
     if (!(leases.isAlive() &&
         cacheFlusher.isAlive() && hlogRoller.isAlive() &&
         workerThread.isAlive() && this.majorCompactionChecker.isAlive())) {
-      // One or more threads are no longer alive - shut down
-      stop();
+      stop("One or more threads are no longer alive");
       return false;
     }
     return true;
@@ -1352,8 +1356,9 @@ public class HRegionServer implements HR
    * in an orderly fashion.  Used by unit tests.
    */
   @Override
-  public void stop() {
+  public void stop(String why) {
     this.stopRequested.set(true);
+    stopReason = why;
     synchronized(this) {
       // Wakes run() if it is sleeping
       notifyAll(); // FindBugs NN_NAKED_NOTIFY
@@ -1367,7 +1372,7 @@ public class HRegionServer implements HR
   public void stopForRestart() {
     restartRequested = true;
     LOG.info("Going down for a restart");
-    stop();
+    stop("stop for restart");
   }
 
   /**
@@ -1390,7 +1395,7 @@ public class HRegionServer implements HR
     if (this.metrics != null) {
       LOG.info("Dump of metrics: " + this.metrics);
     }
-    stop();
+    stop("aborted: " + reason);
   }
 
   /**
@@ -3037,4 +3042,14 @@ public class HRegionServer implements HR
         HRegionServer.class);
     doMain(args, regionServerClass);
   }
+
+  @Override
+  public boolean isStopped() {
+    return stopRequested.get();
+  }
+
+  public String getStopReason() {
+    return stopReason;
+  }
+
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/ShutdownHook.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/ShutdownHook.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/ShutdownHook.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/ShutdownHook.java Tue Apr  3 02:04:26 2012
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.util.Threads;
 
 /**
@@ -99,7 +100,7 @@ class ShutdownHook {
       LOG.info("Shutdown hook starting; " + RUN_SHUTDOWN_HOOK + "=" + b +
         "; fsShutdownHook=" + this.fsShutdownHook);
       if (b) {
-        this.stop.stop();
+        this.stop.stop("shutdown hook");
         Threads.shutdown(this.threadToJoin);
         if (this.fsShutdownHook != null) {
           LOG.info("Starting fs shutdown hook thread.");
@@ -198,9 +199,19 @@ class ShutdownHook {
   // Stoppable with nothing to stop.  Used below in main testing.
   static class DoNothingStoppable implements Stoppable {
     @Override
-    public void stop() {
+    public void stop(String why) {
       // Nothing to do.
     }
+
+    @Override
+    public boolean isStopped() {
+      return false;
+    }
+
+    @Override
+    public String getStopReason() {
+      return null;
+    }
   }
 
   /**

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java Tue Apr  3 02:04:26 2012
@@ -75,6 +75,7 @@ import org.apache.hadoop.hbase.HServerIn
 import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.regionserver.HRegion;
@@ -141,6 +142,9 @@ public class HLog implements Syncable {
   private static final Pattern EDITFILES_NAME_PATTERN =
     Pattern.compile("-?[0-9]+");
 
+  /** We include all timestamps by default */
+  public static final long DEFAULT_LATEST_TS_TO_INCLUDE = Long.MAX_VALUE;
+
   private final FileSystem fs;
   private final Path dir;
   private final Configuration conf;
@@ -1296,23 +1300,14 @@ public class HLog implements Syncable {
   }
 
   /**
-   * Split up a bunch of regionserver commit log files that are no longer
-   * being written to, into new files, one per region for region to replay on
-   * startup. Delete the old log files when finished.
-   *
-   * @param rootDir qualified root directory of the HBase instance
-   * @param srcDir Directory of log files to split: e.g.
-   *                <code>${ROOTDIR}/log_HOST_PORT</code>
-   * @param oldLogDir directory where processed (split) logs will be archived to
-   * @param fs FileSystem
-   * @param conf Configuration
-   * @throws IOException will throw if corrupted hlogs aren't tolerated
-   * @return the list of splits
+   * @see the more general method {@link #splitLog(Path, Path, Path,
+   * FileSystem, Configuration, long, Stoppable)}
    */
   public static List<Path> splitLog(final Path rootDir, final Path srcDir,
     Path oldLogDir, final FileSystem fs, final Configuration conf)
   throws IOException {
-    return splitLog(rootDir, srcDir, oldLogDir, fs, conf, Long.MAX_VALUE);
+    return splitLog(rootDir, srcDir, oldLogDir, fs, conf,
+        DEFAULT_LATEST_TS_TO_INCLUDE, null);
   }
 
   /**
@@ -1332,7 +1327,7 @@ public class HLog implements Syncable {
    */
   public static List<Path> splitLog(final Path rootDir, final Path srcDir,
     Path oldLogDir, final FileSystem fs, final Configuration conf,
-    long maxWriteTime)
+    long maxWriteTime, Stoppable shutdownStatus)
   throws IOException {
     MonitoredTask status = TaskMonitor.get().createStatus(
         "Splitting logs in " + srcDir);
@@ -1354,7 +1349,8 @@ public class HLog implements Syncable {
       LOG.info("Splitting " + logfiles.length + " hlog(s) in " +
         srcDir.toString());
       status.setStatus("Performing split");
-      splits = splitLog(rootDir, srcDir, oldLogDir, logfiles, fs, conf, maxWriteTime);
+      splits = splitLog(rootDir, srcDir, oldLogDir, logfiles, fs, conf,
+          maxWriteTime, shutdownStatus);
       try {
         FileStatus[] files = fs.listStatus(srcDir);
         if (files == null) {
@@ -1438,12 +1434,13 @@ public class HLog implements Syncable {
    * @param fs
    * @param conf
    * @param maxWriteTime ignore entries with ts greater than this
+   * @param shutdownStatus a way to find out if the server is shutting down
    * @return
    * @throws IOException
    */
   private static List<Path> splitLog(final Path rootDir, final Path srcDir,
     Path oldLogDir, final FileStatus[] logfiles, final FileSystem fs,
-    final Configuration conf, long maxWriteTime)
+    final Configuration conf, long maxWriteTime, Stoppable shutdownStatus)
   throws IOException {
     List<Path> processedLogs = new ArrayList<Path>();
     List<Path> corruptedLogs = new ArrayList<Path>();
@@ -1472,6 +1469,7 @@ public class HLog implements Syncable {
           final Map<byte[], LinkedList<Entry>> editsByRegion =
             new TreeMap<byte[], LinkedList<Entry>>(Bytes.BYTES_COMPARATOR);
           for (int j = 0; j < logFilesPerStep; j++) {
+            checkForShutdown(shutdownStatus);
             i++;
             if (i == logfiles.length) {
               break;
@@ -1484,7 +1482,8 @@ public class HLog implements Syncable {
               ": " + logPath + ", length=" + logLength );
             try {
               recoverFileLease(fs, logPath, conf);
-              parseHLog(log, editsByRegion, fs, conf, maxWriteTime);
+              parseHLog(log, editsByRegion, fs, conf, maxWriteTime,
+                  shutdownStatus);
               processedLogs.add(logPath);
             } catch (EOFException eof) {
               // truncated files are expected if a RS crashes (see HBASE-2643)
@@ -1513,7 +1512,7 @@ public class HLog implements Syncable {
             }
           }
           writeEditsBatchToRegions(editsByRegion, logWriters,
-              rootDir, fs, conf);
+              rootDir, fs, conf, shutdownStatus);
         }
         Preconditions.checkNotNull(fs);
         Preconditions.checkNotNull(srcDir);
@@ -1660,7 +1659,8 @@ public class HLog implements Syncable {
   private static void writeEditsBatchToRegions(
     final Map<byte[], LinkedList<Entry>> splitLogsMap,
     final Map<byte[], WriterAndPath> logWriters,
-    final Path rootDir, final FileSystem fs, final Configuration conf)
+    final Path rootDir, final FileSystem fs, final Configuration conf,
+    Stoppable shutdownStatus)
   throws IOException {
     // Number of threads to use when log splitting to rewrite the logs.
     // More means faster but bigger mem consumption.
@@ -1669,12 +1669,15 @@ public class HLog implements Syncable {
     boolean skipErrors = conf.getBoolean("hbase.skip.errors", false);
     HashMap<byte[], Future> writeFutureResult = new HashMap<byte[], Future>();
     ThreadFactoryBuilder tfb = new ThreadFactoryBuilder();
-    tfb.setNameFormat("SplitWriter-%1$d");
+    tfb.setNameFormat((shutdownStatus == null ? "" : shutdownStatus + "-")
+        + "SplitWriter-%1$d");
     tfb.setThreadFactory(Executors.defaultThreadFactory());
     ThreadFactory f  = tfb.build();
-    ThreadPoolExecutor threadPool = (ThreadPoolExecutor)Executors.newFixedThreadPool(logWriterThreads, f);
+    ThreadPoolExecutor threadPool = (ThreadPoolExecutor)
+        Executors.newFixedThreadPool(logWriterThreads, f);
     for (final byte [] region : splitLogsMap.keySet()) {
-      Callable splitter = createNewSplitter(rootDir, logWriters, splitLogsMap, region, fs, conf);
+      Callable splitter = createNewSplitter(rootDir, logWriters, splitLogsMap,
+          region, fs, conf, shutdownStatus);
       writeFutureResult.put(region, threadPool.submit(splitter));
     }
 
@@ -1727,7 +1730,7 @@ public class HLog implements Syncable {
    */
   private static void parseHLog(final FileStatus logfile,
     final Map<byte[], LinkedList<Entry>> splitLogsMap, final FileSystem fs,
-    final Configuration conf, long maxWriteTime)
+    final Configuration conf, long maxWriteTime, Stoppable shutdownStatus)
   throws IOException {
     // Check for possibly empty file. With appends, currently Hadoop reports a
     // zero length even if the file has been sync'd. Revisit if HDFS-376 or
@@ -1756,6 +1759,7 @@ public class HLog implements Syncable {
     try {
       Entry entry;
       while ((entry = in.next()) != null) {
+        checkForShutdown(shutdownStatus);
         //Ignore entries that have a ts greater than maxWriteTime
         if (entry.getKey().getWriteTime() > maxWriteTime) continue;
         byte[] region = entry.getKey().getRegionName();
@@ -1782,7 +1786,8 @@ public class HLog implements Syncable {
   private static Callable<Void> createNewSplitter(final Path rootDir,
     final Map<byte[], WriterAndPath> logWriters,
     final Map<byte[], LinkedList<Entry>> logEntries,
-    final byte[] region, final FileSystem fs, final Configuration conf) {
+    final byte[] region, final FileSystem fs, final Configuration conf,
+    final Stoppable shutdownStatus) {
     return new Callable<Void>() {
       public String getName() {
         return "Split writer thread for region " + Bytes.toStringBinary(region);
@@ -1797,6 +1802,7 @@ public class HLog implements Syncable {
           int editsCount = 0;
           WriterAndPath wap = logWriters.get(region);
           for (Entry logEntry: entries) {
+            checkForShutdown(shutdownStatus);
             if (wap == null) {
               Path regionedits = getRegionSplitEditsPath(fs, logEntry, rootDir,
                   true);
@@ -2043,7 +2049,16 @@ public class HLog implements Syncable {
     }
   }
 
-  private static void split(final Configuration conf, final Path p)
+  private static void checkForShutdown(Stoppable shutdownStatus)
+      throws IOException {
+    if (shutdownStatus != null &&
+        shutdownStatus.isStopped()) {
+      throw new InterruptedIOException("Aborting log splitting: "
+          + shutdownStatus.getStopReason());
+    }
+  }
+
+  private static void splitFromCmdLine(final Configuration conf, final Path p)
   throws IOException {
     FileSystem fs = FileSystem.get(conf);
     if (!fs.exists(p)) {
@@ -2054,7 +2069,8 @@ public class HLog implements Syncable {
     if (!fs.getFileStatus(p).isDir()) {
       throw new IOException(p + " is not a directory");
     }
-    splitLog(baseDir, p, oldLogDir, fs, conf);
+    splitLog(baseDir, p, oldLogDir, fs, conf, DEFAULT_LATEST_TS_TO_INCLUDE,
+        null);
   }
 
   /**
@@ -2079,7 +2095,7 @@ public class HLog implements Syncable {
           conf.set("fs.default.name", args[i]);
           conf.set("fs.defaultFS", args[i]);
           Path logPath = new Path(args[i]);
-          split(conf, logPath);
+          splitFromCmdLine(conf, logPath);
         } catch (Throwable t) {
           t.printStackTrace(System.err);
           System.exit(-1);
@@ -2090,4 +2106,9 @@ public class HLog implements Syncable {
       System.exit(-1);
     }
   }
+
+  /** Used in a simulated kill of a regionserver */
+  public void kill() {
+    logSyncerThread.interrupt();
+  }
 }

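The checkForShutdown() calls threaded through splitLog(), parseHLog() and the per-region split writers implement cooperative cancellation: each long-running loop polls the Stoppable and bails out with an InterruptedIOException once a stop has been requested, while a null status (as passed by splitFromCmdLine) disables the check. A minimal, self-contained sketch of the same pattern, using hypothetical names rather than the HBase API:

    import java.io.IOException;
    import java.io.InterruptedIOException;
    import java.util.List;

    // Illustration only: cooperative abort of a long-running batch job.
    final class CooperativeSplitSketch {
      interface StopStatus {
        boolean isStopped();
        String getStopReason();
      }

      private static void checkForShutdown(StopStatus stop) throws IOException {
        // A null status disables the check, e.g. for command-line tools.
        if (stop != null && stop.isStopped()) {
          throw new InterruptedIOException("Aborting: " + stop.getStopReason());
        }
      }

      static void splitAll(List<String> logFiles, StopStatus stop)
          throws IOException {
        for (String logFile : logFiles) {
          checkForShutdown(stop);   // poll once per unit of work
          // ... recover the lease, parse the file, queue its edits ...
        }
      }
    }

Polling once per file, once per parsed entry and once per rewritten edit keeps the abort latency low without adding any synchronization to the hot path.
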
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java Tue Apr  3 02:04:26 2012
@@ -175,7 +175,7 @@ public class JVMClusterUtil {
           t.shutdown();
         } else {
           // This will only stop this particular master.
-          t.stopMaster();
+          t.stop("normal shutdown");
         }
       }
     }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RackManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RackManager.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RackManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RackManager.java Tue Apr  3 02:04:26 2012
@@ -47,6 +47,6 @@ public class RackManager {
       return racks.get(0);
     }
 
-    return HConstants.UNKOWN_RACK;
+    return HConstants.UNKNOWN_RACK;
   }
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java Tue Apr  3 02:04:26 2012
@@ -21,8 +21,7 @@ package org.apache.hadoop.hbase.util;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
-import java.util.concurrent.atomic.AtomicBoolean;
+import org.apache.hadoop.hbase.StopStatus;
 
 /**
  * Sleeper for current thread.
@@ -33,7 +32,7 @@ import java.util.concurrent.atomic.Atomi
 public class Sleeper {
   private final Log LOG = LogFactory.getLog(this.getClass().getName());
   private final int period;
-  private final AtomicBoolean stop;
+  private final StopStatus stop;
   private static final long MINIMAL_DELTA_FOR_LOGGING = 10000;
 
   private final Object sleepLock = new Object();
@@ -43,7 +42,7 @@ public class Sleeper {
    * @param sleep sleep time in milliseconds
    * @param stop flag for when we stop
    */
-  public Sleeper(final int sleep, final AtomicBoolean stop) {
+  public Sleeper(final int sleep, final StopStatus stop) {
     this.period = sleep;
     this.stop = stop;
   }
@@ -72,7 +71,7 @@ public class Sleeper {
   * will be docked current time minus passed <code>startTime</code>.
    */
   public void sleep(final long startTime) {
-    if (this.stop.get()) {
+    if (stop.isStopped()) {
       return;
     }
     long now = System.currentTimeMillis();
@@ -101,7 +100,7 @@ public class Sleeper {
       } catch(InterruptedException iex) {
        // Were we interrupted because we're meant to stop?  If not, just
         // continue ignoring the interruption
-        if (this.stop.get()) {
+        if (stop.isStopped()) {
           return;
         }
       }

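After this change Sleeper no longer shares an AtomicBoolean with its owner; it consults a StopStatus callback instead, so any stoppable owner (regionserver, master, or a test harness) can be plugged in directly. A hedged usage sketch, assuming only that StopStatus is an interface with the single isStopped() method visible in the hunk above:

    import java.util.concurrent.atomic.AtomicBoolean;

    import org.apache.hadoop.hbase.StopStatus;
    import org.apache.hadoop.hbase.util.Sleeper;

    // Sketch only: a Sleeper driven by a StopStatus callback instead of a
    // shared AtomicBoolean.
    public class SleeperSketch {
      public static void main(String[] args) {
        final AtomicBoolean stopRequested = new AtomicBoolean(false);
        Sleeper sleeper = new Sleeper(1000, new StopStatus() {
          @Override
          public boolean isStopped() {
            return stopRequested.get();   // delegate to whatever owns the flag
          }
        });
        sleeper.sleep(System.currentTimeMillis());  // returns early once stopped
      }
    }

The indirection is what allows the same Sleeper to serve both HRegionServer and HMaster now that their stop flags live behind the Stoppable/StopStatus interfaces.
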
Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Tue Apr  3 02:04:26 2012
@@ -35,6 +35,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.NavigableSet;
@@ -52,14 +53,19 @@ import org.apache.hadoop.hbase.client.De
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.client.HConnection;
+import org.apache.hadoop.hbase.client.HConnectionManager;
 import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.NoServerForRegionException;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.RetriesExhaustedException;
 import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.ServerConnectionManager;
 import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
 import org.apache.hadoop.hbase.io.hfile.Compression;
 import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
@@ -276,6 +282,7 @@ public class HBaseTestingUtility {
    */
   public void shutdownMiniDFSCluster() throws Exception {
     if (this.dfsCluster != null) {
+      FileSystem.closeAll();
       // The below throws an exception per dn, AsynchronousCloseException.
       this.dfsCluster.shutdown();
     }
@@ -390,8 +397,21 @@ public class HBaseTestingUtility {
     fs.mkdirs(hbaseRootdir);
     FSUtils.setVersion(fs, hbaseRootdir);
     startMiniHBaseCluster(numMasters, numSlaves);
+
     // Don't leave here till we've done a successful scan of the .META.
-    HTable t = new HTable(this.conf, HConstants.META_TABLE_NAME);
+    HTable t = null;
+    for (int i = 0; i < 10; ++i) {
+      try {
+        t = new HTable(this.conf, HConstants.META_TABLE_NAME);
+      } catch (NoServerForRegionException ex) {
+        LOG.error("META is not online, sleeping");
+        Threads.sleepWithoutInterrupt(2000);
+      }
+    }
+    if (t == null) {
+      throw new IOException("Could not open META on cluster startup");
+    }
+
     ResultScanner s = t.getScanner(new Scan());
     while (s.next() != null) continue;
     LOG.info("Minicluster is up");
@@ -1123,23 +1143,59 @@ public class HBaseTestingUtility {
   public void waitUntilAllRegionsAssigned(final int countOfRegions)
   throws IOException {
     HTable meta = new HTable(getConfiguration(), HConstants.META_TABLE_NAME);
+    HConnection connection = ServerConnectionManager.getConnection(conf);
+TOP_LOOP:
     while (true) {
       int rows = 0;
       Scan scan = new Scan();
       scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
-      ResultScanner s = meta.getScanner(scan);
+      ResultScanner s;
+      try {
+        s = meta.getScanner(scan);
+      } catch (RetriesExhaustedException ex) {
+        // This function has infinite patience.
+        Threads.sleepWithoutInterrupt(2000);
+        continue;
+      }
+      Map<String, HRegionInfo[]> regionAssignment =
+          new HashMap<String, HRegionInfo[]>();
+REGION_LOOP:
       for (Result r = null; (r = s.next()) != null;) {
         byte [] b =
           r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
         if (b == null || b.length <= 0) break;
-        rows++;
+        // Make sure the regionserver really has this region.
+        String serverAddress = Bytes.toString(b);
+        if (!regionAssignment.containsKey(serverAddress)) {
+          HRegionInterface hri =
+            connection.getHRegionConnection(new HServerAddress(serverAddress),
+                false);
+          HRegionInfo[] regions;
+          try {
+            regions = hri.getRegionsAssignment();
+          } catch (IOException ex) {
+            LOG.info("Could not contact regionserver " + serverAddress);
+            Threads.sleepWithoutInterrupt(1000);
+            continue TOP_LOOP;
+          }
+          regionAssignment.put(serverAddress, regions);
+        }
+        String regionName = Bytes.toString(r.getRow());
+        for (HRegionInfo regInfo : regionAssignment.get(serverAddress)) {
+          String regNameOnRS = Bytes.toString(regInfo.getRegionName());
+          if (regNameOnRS.equals(regionName)) {
+            rows++;
+            continue REGION_LOOP;
+          }
+        }
       }
       s.close();
       // If I get to here and all rows have a Server, then all have been assigned.
-      if (rows == countOfRegions) break;
+      if (rows == countOfRegions)
+        break;
       LOG.info("Found " + rows + " open regions, waiting for " +
           countOfRegions);
-      Threads.sleep(1000);
+      Threads.sleepWithoutInterrupt(1000);
     }
   }
 

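The hardened waitUntilAllRegionsAssigned() above no longer trusts .META. alone: for every server named in .META. it also asks that regionserver, via getRegionsAssignment(), whether the region is really open there, and it rides over transient scanner or RPC failures by sleeping and retrying. A rough sketch of how a test would call it (the mini-cluster setup and table-creation steps are placeholders; only waitUntilAllRegionsAssigned itself appears in the diff):

    // Sketch of typical test usage; the cluster-setup method and the region
    // count are assumptions for illustration.
    HBaseTestingUtility testUtil = new HBaseTestingUtility();
    testUtil.startMiniCluster(3);                // e.g. 3 regionservers
    // ... create a table pre-split into 10 regions and load a few rows ...
    testUtil.waitUntilAllRegionsAssigned(10);    // blocks until .META. and the
                                                 // regionservers agree

Because the method has "infinite patience", callers should lean on the surrounding test timeout rather than expect it to fail fast when a region never comes online.
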
Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java Tue Apr  3 02:04:26 2012
@@ -340,7 +340,7 @@ public class MiniHBaseCluster {
     JVMClusterUtil.RegionServerThread server =
       hbaseCluster.getRegionServers().get(serverNumber);
     LOG.info("Stopping " + server.toString());
-    server.getRegionServer().stop();
+    server.getRegionServer().stop("normal shutdown");
     return server;
   }
 
@@ -535,10 +535,15 @@ public class MiniHBaseCluster {
   public HMaster killMaster(int serverNumber) {
     HMaster server = hbaseCluster.getMasters().get(serverNumber);
     LOG.info("Killing master " + server.toString());
-    server.killMaster();
+    server.stop("killing master");
     return server;
   }
 
+  public void killActiveMaster() {
+    LOG.info("Killing active master");
+    hbaseCluster.getActiveMaster().stop("killing master");
+  }
+
   /**
    * Blocks until there is an active master and that master has completed
    * initialization.

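killActiveMaster() complements the existing killMaster(int) by resolving the currently active master itself, so a failover test does not need to know which master index happens to be active. A hedged sketch of the intended use (the wait step is an assumption; only its javadoc fragment is visible above, not its method name):

    // Sketch: simulate a master failover from a test.
    MiniHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
    cluster.killActiveMaster();   // internally calls stop("killing master")
    // ... then block until a backup master becomes active and finishes
    // initialization, using the "blocks until there is an active master"
    // helper whose javadoc is partially shown above ...

Stopping the master via stop("killing master") keeps the shutdown reason available through getStopReason(), which the aborted-split message in HLog.checkForShutdown() includes.
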
Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/MultiMasterTest.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/MultiMasterTest.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/MultiMasterTest.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/MultiMasterTest.java Tue Apr  3 02:04:26 2012
@@ -28,12 +28,14 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HServerAddress;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
 import org.junit.After;
@@ -143,6 +145,16 @@ public class MultiMasterTest {
     return cluster;
   }
 
+  public void waitUntilRegionServersCheckIn(int numRS) {
+    while (true) {
+      HMaster master = cluster.getHBaseCluster().getActiveMaster();
+      if (master != null && master.getServerManager().numServers() >= numRS) {
+        return;
+      }
+      Threads.sleepWithoutInterrupt(HConstants.SOCKET_RETRY_WAIT_MS);
+    }
+  }
+
   protected void waitForActiveMasterAndVerify() throws InterruptedException {
     final List<HMaster> masters = miniCluster().getMasters();
     // wait for an active master to show up and be ready
@@ -159,7 +171,7 @@ public class MultiMasterTest {
       throws InterruptedException {
     HMaster master = cluster.getMaster(masterIndex);
     HServerAddress address = master.getHServerAddress();
-    master.killMaster();
+    master.stop("killing master in test");
     cluster.getHBaseCluster().waitOnMasterStop(masterIndex);
     return address;
   }

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestClusterStartupDetection.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestClusterStartupDetection.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestClusterStartupDetection.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestClusterStartupDetection.java Tue Apr  3 02:04:26 2012
@@ -34,7 +34,7 @@ public class TestClusterStartupDetection
     assertEquals(activeMaster, getActiveMasterIndex());
 
     header("Killing the master");
-    miniCluster().getMaster().killMaster();
+    miniCluster().killActiveMaster();
     localCluster().waitOnMasterStop(0);
 
     killRegionServerWithMeta();
@@ -44,6 +44,7 @@ public class TestClusterStartupDetection
 
     assertFalse("Incorrectly identified a cluster restart as a fresh " +
         "cluster startup", newMaster.isClusterStartup());
+    waitUntilRegionServersCheckIn(numRS - 1);
   }
 
   @Test(timeout=240000)
@@ -67,6 +68,7 @@ public class TestClusterStartupDetection
     final HMaster newMaster = miniCluster().startNewMaster();
     assertFalse("The second master started after a delay thinks this is a " +
         "fresh cluster startup", newMaster.isClusterStartup());
+    waitUntilRegionServersCheckIn(numRS);
   }
 
   @Test(timeout=240000)
@@ -91,7 +93,7 @@ public class TestClusterStartupDetection
       final HMaster activeMaster = miniCluster().getMasters().get(activeIndex);
       header("Killing master");
       oldActiveName = activeMaster.getServerName();
-      activeMaster.killMaster();
+      activeMaster.stop("killing master");
       localCluster().waitOnMasterStop(activeIndex);
     }
 
@@ -108,6 +110,7 @@ public class TestClusterStartupDetection
     // startup anymore.
     assertFalse("The new active master incorrectly thinks that this is a " +
         "fresh cluster startup.", newActiveMaster.isClusterStartup());
+    waitUntilRegionServersCheckIn(numRS);
   }
 
 }

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitAtStartup.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitAtStartup.java?rev=1308651&r1=1308650&r2=1308651&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitAtStartup.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitAtStartup.java Tue Apr  3 02:04:26 2012
@@ -58,7 +58,7 @@ public class TestDistributedLogSplitAtSt
 
   @Test
   public void testDistributedLogSplitAtStartup() throws Exception {
-    DataLoader dataLoader = new DataLoader(conf);
+    DataLoader dataLoader = new DataLoader(conf, TEST_UTIL);
     new Thread(dataLoader).start();
     dataLoader.waitUntilHalfRowsLoaded();
     MiniHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
@@ -75,6 +75,7 @@ public class TestDistributedLogSplitAtSt
 
     LOG.info("Starting new region server");
     cluster.startRegionServer();
+    dataLoader.waitUntilRegionsAssigned();
     dataLoader.waitUntilFinishedOrFailed();
     dataLoader.join();
     dataLoader.assertSuccess();


