hbase-commits mailing list archives

From: st...@apache.org
Subject: svn commit: r998380 - in /hbase/trunk: ./ bin/ src/main/java/org/apache/hadoop/hbase/catalog/ src/main/java/org/apache/hadoop/hbase/ipc/ src/main/java/org/apache/hadoop/hbase/master/ src/main/java/org/apache/hadoop/hbase/regionserver/ src/test/java/org...
Date: Sat, 18 Sep 2010 00:51:05 GMT
Author: stack
Date: Sat Sep 18 00:51:05 2010
New Revision: 998380

URL: http://svn.apache.org/viewvc?rev=998380&view=rev
Log:
HBASE-3010 Can't start/stop/start... cluster using new master 

Removed:
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMinimumServerCount.java
Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/bin/stop-hbase.sh
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Sep 18 00:51:05 2010
@@ -526,6 +526,7 @@ Release 0.21.0 - Unreleased
    HBASE-3006  Reading compressed HFile blocks causes way too many DFS RPC
                calls severly impacting performance
                (Kannan Muthukkaruppan via Stack)
+   HBASE-3010  Can't start/stop/start... cluster using new master
 
   IMPROVEMENTS
    HBASE-1760  Cleanup TODOs in HTable

Modified: hbase/trunk/bin/stop-hbase.sh
URL: http://svn.apache.org/viewvc/hbase/trunk/bin/stop-hbase.sh?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/bin/stop-hbase.sh (original)
+++ hbase/trunk/bin/stop-hbase.sh Sat Sep 18 00:51:05 2010
@@ -55,6 +55,8 @@ while kill -0 `cat $pid` > /dev/null 2>&
   echo -n "."
   sleep 1;
 done
+# Add a CR after we're done w/ dots.
+echo
 
 # distributed == false means that the HMaster will kill ZK when it exits
 distMode=`$bin/hbase org.apache.hadoop.hbase.util.HBaseConfTool hbase.cluster.distributed`

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java Sat Sep 18 00:51:05 2010
@@ -114,9 +114,6 @@ public class CatalogTracker {
   public void start() throws IOException, InterruptedException {
     this.rootRegionTracker.start();
     this.metaNodeTracker.start();
-    // Determine meta assignment; may not work because root and meta not yet
-    // deployed.  Calling the below will set {@link #metaLocation}.
-    getMetaServerConnection(true);
   }
 
   /**
@@ -205,7 +202,7 @@ public class CatalogTracker {
    */
   private HRegionInterface getRootServerConnection()
   throws IOException, InterruptedException {
-    HServerAddress address = rootRegionTracker.getRootRegionLocation();
+    HServerAddress address = this.rootRegionTracker.getRootRegionLocation();
     if (address == null) {
       return null;
     }

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java Sat Sep 18 00:51:05 2010
@@ -20,18 +20,6 @@
 
 package org.apache.hadoop.hbase.ipc;
 
-import com.google.common.base.Function;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.hadoop.util.StringUtils;
-
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.DataInputStream;
@@ -64,6 +52,18 @@ import java.util.concurrent.ExecutorServ
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.base.Function;
+
 /** An abstract IPC service.  IPC calls take a single {@link Writable} as a
  * parameter, and return a {@link Writable} as their value.  A service runs on
  * a port and is defined by a parameter class and a value class.
@@ -151,7 +151,6 @@ public abstract class HBaseServer {
 
   protected Configuration conf;
 
-  @SuppressWarnings({"FieldCanBeLocal"})
   private int maxQueueSize;
   protected int socketSendBufferSize;
   protected final boolean tcpNoDelay;   // if T then disable Nagle's Algorithm
@@ -285,7 +284,6 @@ public abstract class HBaseServer {
         this.readSelector = readSelector;
       }
       public void run() {
-        LOG.info("Starting SocketReader");
         synchronized(this) {
           while (running) {
             SelectionKey key = null;

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Sat Sep 18 00:51:05 2010
@@ -95,7 +95,7 @@ class ActiveMasterManager extends ZooKee
           clusterHasActiveMaster.set(true);
         } else {
           // Node is no longer there, cluster does not have an active master
-          LOG.debug("No master available. notifying waiting threads");
+          LOG.debug("No master available. Notifying waiting threads");
           clusterHasActiveMaster.set(false);
           // Notify any thread waiting to become the active master
           clusterHasActiveMaster.notifyAll();
@@ -114,46 +114,56 @@ class ActiveMasterManager extends ZooKee
    *
    * This also makes sure that we are watching the master znode so will be
    * notified if another master dies.
-   * @return False if we did not start up this cluster, another
-   * master did, or if a problem (zookeeper, stop flag has been set on this
-   * Master)
+   * @return True if no issue becoming active master else false if another
+   * master was running or if some other problem (zookeeper, stop flag has been
+   * set on this Master)
    */
   boolean blockUntilBecomingActiveMaster() {
-    boolean thisMasterStartedCluster = true;
+    boolean cleanSetOfActiveMaster = true;
     // Try to become the active master, watch if there is another master
     try {
-      if(ZKUtil.setAddressAndWatch(watcher, watcher.masterAddressZNode,
-          address)) {
+      if (ZKUtil.setAddressAndWatch(this.watcher,
+          this.watcher.masterAddressZNode, this.address)) {
         // We are the master, return
-        clusterHasActiveMaster.set(true);
-        return thisMasterStartedCluster;
+        this.clusterHasActiveMaster.set(true);
+        return cleanSetOfActiveMaster;
+      }
+
+      // There is another active master running elsewhere or this is a restart
+      // and the master ephemeral node has not expired yet.
+      this.clusterHasActiveMaster.set(true);
+      cleanSetOfActiveMaster = false;
+      HServerAddress currentMaster =
+        ZKUtil.getDataAsAddress(this.watcher, this.watcher.masterAddressZNode);
+      if (currentMaster != null && currentMaster.equals(this.address)) {
+        LOG.info("Current master has this master's address, " + currentMaster +
+          "; master was restarted?  Waiting on znode to expire...");
+        // Hurry along the expiration of the znode.
+        ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+      } else {
+        LOG.info("Another master is the active master, " + currentMaster +
+          "; waiting to become the next active master");
       }
     } catch (KeeperException ke) {
       master.abort("Received an unexpected KeeperException, aborting", ke);
       return false;
     }
-    // There is another active master, this is not a cluster startup
-    // and we must wait until the active master dies
-    LOG.info("Another master is already the active master, waiting to become " +
-      "the next active master");
-    clusterHasActiveMaster.set(true);
-    thisMasterStartedCluster = false;
-    synchronized(clusterHasActiveMaster) {
-      while(clusterHasActiveMaster.get() && !master.isStopped()) {
+    synchronized (this.clusterHasActiveMaster) {
+      while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
         try {
-          clusterHasActiveMaster.wait();
+          this.clusterHasActiveMaster.wait();
         } catch (InterruptedException e) {
           // We expect to be interrupted when a master dies, will fall out if so
           LOG.debug("Interrupted waiting for master to die", e);
         }
       }
-      if(master.isStopped()) {
-        return thisMasterStartedCluster;
+      if (this.master.isStopped()) {
+        return cleanSetOfActiveMaster;
       }
       // Try to become active master again now that there is no active master
       blockUntilBecomingActiveMaster();
     }
-    return thisMasterStartedCluster;
+    return cleanSetOfActiveMaster;
   }
 
   /**
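
For reference, a minimal sketch of how a caller might read the reworked return value of blockUntilBecomingActiveMaster(); the manager reference and the logging below are illustrative only, not part of this commit.

    boolean cleanSetOfActiveMaster =
        activeMasterManager.blockUntilBecomingActiveMaster();
    if (cleanSetOfActiveMaster) {
      // We wrote our address into the master znode on the first attempt.
      LOG.info("Became active master without contention");
    } else {
      // Another master was already running (we waited it out), our own stale
      // znode from a previous run had to be cleared, or a problem occurred.
      LOG.info("Became active master after waiting on a previous master");
    }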

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Sat Sep 18 00:51:05 2010
@@ -157,8 +157,8 @@ public class AssignmentManager extends Z
   void cleanoutUnassigned() throws IOException, KeeperException {
     // Cleanup any existing ZK nodes and start watching
     ZKAssign.deleteAllNodes(watcher);
-    ZKUtil.listChildrenAndWatchForNewChildren(watcher,
-        watcher.assignmentZNode);
+    ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
+      this.watcher.assignmentZNode);
   }
 
   /**
@@ -545,7 +545,7 @@ public class AssignmentManager extends Z
       if (plan == null) {
         LOG.debug("No previous transition plan for " +
             state.getRegion().getRegionNameAsString() +
-            " so generating a random one from " + serverManager.numServers() +
+            " so generating a random one from " + serverManager.countOfRegionServers() +
             " ( " + serverManager.getOnlineServers().size() + ") available servers");
         plan = new RegionPlan(state.getRegion(), null,
           LoadBalancer.randomAssignment(serverManager.getOnlineServersList()));

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat Sep 18 00:51:05 2010
@@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.UnknownRe
 import org.apache.hadoop.hbase.catalog.CatalogTracker;
 import org.apache.hadoop.hbase.catalog.MetaEditor;
 import org.apache.hadoop.hbase.catalog.MetaReader;
+import org.apache.hadoop.hbase.catalog.RootLocationEditor;
 import org.apache.hadoop.hbase.client.HConnection;
 import org.apache.hadoop.hbase.client.HConnectionManager;
 import org.apache.hadoop.hbase.client.MetaScanner;
@@ -145,12 +146,14 @@ implements HMasterInterface, HMasterRegi
   // Cluster status zk tracker and local setter
   private ClusterStatusTracker clusterStatusTracker;
 
-  // True if this is the master that started the cluster.
-  boolean clusterStarter;
-
-  // This flag is for stopping this Master instance.
-  private boolean stopped = false;
-  // Set on abort -- usually failure of our zk session
+  // True if this a cluster startup where there are no already running servers
+  // as opposed to a master joining an already running cluster
+  boolean freshClusterStartup;
+
+  // This flag is for stopping this Master instance.  Its set when we are
+  // stopping or aborting
+  private volatile boolean stopped = false;
+  // Set on abort -- usually failure of our zk session.
   private volatile boolean abort = false;
 
   // Instance of the hbase executor service.
@@ -178,17 +181,17 @@ implements HMasterInterface, HMasterRegi
     this.conf = conf;
     /*
      * 1. Determine address and initialize RPC server (but do not start).
-     * The RPC server ports can be ephemeral.
+     * The RPC server ports can be ephemeral. Create a ZKW instance.
      */
     HServerAddress a = new HServerAddress(getMyAddress(this.conf));
     int numHandlers = conf.getInt("hbase.regionserver.handler.count", 10);
     this.rpcServer = HBaseRPC.getServer(this,
-	new Class<?>[]{HMasterInterface.class, HMasterRegionInterface.class},
-        a.getBindAddress(), a.getPort(),
-        numHandlers,
-        0, // we dont use high priority handlers in master
-        false, conf,
-        0); // this is a DNC w/o high priority handlers
+      new Class<?>[]{HMasterInterface.class, HMasterRegionInterface.class},
+      a.getBindAddress(), a.getPort(),
+      numHandlers,
+      0, // we dont use high priority handlers in master
+      false, conf,
+      0); // this is a DNC w/o high priority handlers
     this.address = new HServerAddress(rpcServer.getListenerAddress());
 
     // set the thread name now we have an address
@@ -201,24 +204,11 @@ implements HMasterInterface, HMasterRegi
         "_" + System.currentTimeMillis());
     }
 
-    /*
-     * 2. Determine if this is a fresh cluster startup or failed over master.
-     * This is done by checking for the existence of any ephemeral
-     * RegionServer nodes in ZooKeeper.  These nodes are created by RSs on
-     * their initialization but only after they find the primary master.  As
-     * long as this check is done before we write our address into ZK, this
-     * will work.  Note that multiple masters could find this to be true on
-     * startup (none have become active master yet), which is why there is an
-     * additional check if this master does not become primary on its first attempt.
-     */
     this.zooKeeper =
       new ZooKeeperWatcher(conf, MASTER + "-" + getMasterAddress(), this);
 
-    this.clusterStarter = 0 ==
-      ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
-
     /*
-     * 3. Block on becoming the active master.
+     * 2. Block on becoming the active master.
      * We race with other masters to write our address into ZooKeeper.  If we
      * succeed, we are the primary/active master and finish initialization.
      *
@@ -228,32 +218,25 @@ implements HMasterInterface, HMasterRegi
      */
     this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, this);
     this.zooKeeper.registerListener(activeMasterManager);
+    stallIfBackupMaster(this.conf, this.activeMasterManager);
+    activeMasterManager.blockUntilBecomingActiveMaster();
 
+    /*
+     * 3. Determine if this is a fresh cluster startup or failed over master.
+     * This is done by checking for the existence of any ephemeral
+     * RegionServer nodes in ZooKeeper.  These nodes are created by RSs on
+     * their initialization but initialization will not happen unless clusterup
+     * flag is set -- see ClusterStatusTracker below.
+     */
+    this.freshClusterStartup =
+      0 == ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
 
-    // If we're a backup master, stall until a primary to writes his address
-    if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP,
-        HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
-      // This will only be a minute or so while the cluster starts up,
-      // so don't worry about setting watches on the parent znode
-      while (!this.activeMasterManager.isActiveMaster()) {
-        try {
-          LOG.debug("Waiting for master address ZNode to be written " +
-            "(Also watching cluster state node)");
-          Thread.sleep(conf.getInt("zookeeper.session.timeout", 60 * 1000));
-        } catch (InterruptedException e) {
-          // interrupted = user wants to kill us.  Don't continue
-          throw new IOException("Interrupted waiting for master address");
-        }
-      }
-    }
-
-    // Wait here until we are the active master
-    clusterStarter = activeMasterManager.blockUntilBecomingActiveMaster();
-
-    /**
+    /*
      * 4. We are active master now... go initialize components we need to run.
+     * Note, there may be dross in zk from previous runs; it'll get addressed
+     * when we enter {@link #run()} below.
      */
-    // TODO: Do this using Dependency Injection, using PicoContainer or Spring.
+    // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
     this.fileSystemManager = new MasterFileSystem(this);
     this.connection = HConnectionManager.getConnection(conf);
     this.executorService = new ExecutorService(getServerName());
@@ -270,18 +253,40 @@ implements HMasterInterface, HMasterRegi
 
     this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
       this.serverManager);
-    regionServerTracker.start();
+    this.regionServerTracker.start();
 
-    // Set the cluster as up.
+    // Set the cluster as up.  If new RSs, they'll be waiting on this before
+    // going ahead with their startup.
     this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
     this.clusterStatusTracker.setClusterUp();
     this.clusterStatusTracker.start();
 
     LOG.info("Server active/primary master; " + this.address +
-      "; clusterStarter=" + this.clusterStarter + ", sessionid=0x" +
+      "; freshClusterStart=" + this.freshClusterStartup + ", sessionid=0x" +
       Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()));
   }
 
+  /*
+   * Stall startup if we are designated a backup master.
+   * @param c
+   * @param amm
+   * @throws InterruptedException
+   */
+  private static void stallIfBackupMaster(final Configuration c,
+      final ActiveMasterManager amm)
+  throws InterruptedException {
+    // If we're a backup master, stall until a primary to writes his address
+    if (!c.getBoolean(HConstants.MASTER_TYPE_BACKUP,
+      HConstants.DEFAULT_MASTER_TYPE_BACKUP)) return;
+    // This will only be a minute or so while the cluster starts up,
+    // so don't worry about setting watches on the parent znode
+    while (!amm.isActiveMaster()) {
+      LOG.debug("Waiting for master address ZNode to be written " +
+        "(Also watching cluster state node)");
+      Thread.sleep(c.getInt("zookeeper.session.timeout", 60 * 1000));
+    }
+  }
+
   /**
    * Main processing loop for the HMaster.
    * 1. Handle both fresh cluster start as well as failed over initialization of
@@ -295,22 +300,24 @@ implements HMasterInterface, HMasterRegi
     try {
       // start up all service threads.
       startServiceThreads();
-      // wait for minimum number of region servers to be up
-      this.serverManager.waitForMinServers();
-      // start assignment of user regions, startup or failure
-      if (this.clusterStarter) {
-        clusterStarterInitializations(this.fileSystemManager,
+      // Wait for minimum number of region servers to report in
+      this.serverManager.waitForRegionServers();
+
+      // Start assignment of user regions, startup or failure
+      if (!this.stopped) {
+        if (this.freshClusterStartup) {
+          clusterStarterInitializations(this.fileSystemManager,
             this.serverManager, this.catalogTracker, this.assignmentManager);
-      } else {
-        // Process existing unassigned nodes in ZK, read all regions from META,
-        // rebuild in-memory state.
-        this.assignmentManager.processFailover();
+        } else {
+          // Process existing unassigned nodes in ZK, read all regions from META,
+          // rebuild in-memory state.
+          this.assignmentManager.processFailover();
+        }
       }
+
       // Check if we should stop every second.
       Sleeper sleeper = new Sleeper(1000, this);
-      while (!this.stopped  && !this.abort) {
-        sleeper.sleep();
-      }
+      while (!this.stopped) sleeper.sleep();
     } catch (Throwable t) {
       abort("Unhandled exception. Starting shutdown.", t);
     }
@@ -341,22 +348,17 @@ implements HMasterInterface, HMasterRegi
 
   /*
    * Initializations we need to do if we are cluster starter.
-   * @param starter
    * @param mfs
+   * @param sm
+   * @param ct
+   * @param am
    * @throws IOException
    */
   private static void clusterStarterInitializations(final MasterFileSystem mfs,
     final ServerManager sm, final CatalogTracker ct, final AssignmentManager am)
   throws IOException, InterruptedException, KeeperException {
-      // This master is starting the cluster (its not a preexisting cluster
-      // that this master is joining).
-      // Initialize the filesystem, which does the following:
-      //   - Creates the root hbase directory in the FS if DNE
-      //   - If fresh start, create first ROOT and META regions (bootstrap)
-      //   - Checks the FS to make sure the root directory is readable
-      //   - Creates the archive directory for logs
+      // Check filesystem has required basics
       mfs.initialize();
-      // Do any log splitting necessary
       // TODO: Should do this in background rather than block master startup
       // TODO: Do we want to do this before/while/after RSs check in?
       //       It seems that this method looks at active RSs but happens
@@ -795,6 +797,7 @@ implements HMasterInterface, HMasterRegi
     if (t != null) LOG.fatal(msg, t);
     else LOG.fatal(msg);
     this.abort = true;
+    stop("Aborting");
   }
 
   @Override
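
Condensed, the reordered master startup above amounts to the sketch below (assembled from the diff; error handling and the rest of the constructor are omitted). Per the new comment, regionservers do not register under rsZNode until the clusterup flag is set, so the child count is only consulted once this master is active.

    // 2. Race for the master znode first; designated backup masters stall here.
    stallIfBackupMaster(this.conf, this.activeMasterManager);
    this.activeMasterManager.blockUntilBecomingActiveMaster();
    // 3. Only now decide whether this is a fresh cluster startup or a master
    // joining (failing over to) an already running cluster.
    this.freshClusterStartup =
      0 == ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);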

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java Sat Sep 18 00:51:05 2010
@@ -19,8 +19,8 @@
  */
 package org.apache.hadoop.hbase.master;
 
-import java.io.IOException;
 import java.io.File;
+import java.io.IOException;
 import java.util.List;
 
 import org.apache.commons.cli.CommandLine;
@@ -30,11 +30,10 @@ import org.apache.commons.cli.ParseExcep
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.ZooKeeperConnectionException;
-import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MasterNotRunningException;
+import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.util.ServerCommandLine;

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Sat Sep 18 00:51:05 2010
@@ -83,8 +83,11 @@ public class MasterFileSystem {
   }
 
   /**
+   * Create initial layout in filesystem.
    * <ol>
-   * <li>Check if the root region exists and is readable, if not create it</li>
+   * <li>Check if the root region exists and is readable, if not create it.
+   * Create hbase.version and the -ROOT- directory if not one.
+   * </li>
    * <li>Create a log archive directory for RS to put archived logs</li>
    * </ol>
    */

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Sat Sep 18 00:51:05 2010
@@ -86,8 +86,6 @@ public class ServerManager {
 
   private final ServerMonitor serverMonitorThread;
 
-  private int minimumServerCount;
-
   private final LogCleaner logCleaner;
 
   // Reporting to track master metrics.
@@ -106,7 +104,7 @@ public class ServerManager {
 
     @Override
     protected void chore() {
-      int numServers = numServers();
+      int numServers = countOfRegionServers();
       int numDeadServers = deadservers.size();
       double averageLoad = getAverageLoad();
       String deadServersList = deadservers.toString();
@@ -127,7 +125,6 @@ public class ServerManager {
     this.services = services;
     Configuration c = master.getConfiguration();
     int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
-    this.minimumServerCount = c.getInt("hbase.regions.server.count.min", 1);
     this.metrics = new MasterMetrics(master.getServerName());
     this.serverMonitorThread = new ServerMonitor(monitorInterval, master);
     String n = Thread.currentThread().getName();
@@ -220,8 +217,8 @@ public class ServerManager {
     info.setLoad(load);
     // TODO: Why did we update the RS location ourself?  Shouldn't RS do this?
     // masterStatus.getZooKeeper().updateRSLocationGetWatch(info, watcher);
-    onlineServers.put(serverName, info);
-    if(hri == null) {
+    this.onlineServers.put(serverName, info);
+    if (hri == null) {
       serverConnections.remove(serverName);
     } else {
       serverConnections.put(serverName, hri);
@@ -277,7 +274,7 @@ public class ServerManager {
     }
 
     HMsg [] reply = null;
-    int numservers = numServers();
+    int numservers = countOfRegionServers();
     if (this.clusterShutdown) {
       if (numservers <= 2) {
         // Shutdown needs to be staggered; the meta regions need to close last
@@ -362,14 +359,10 @@ public class ServerManager {
     return averageLoad;
   }
 
-  /** @return the number of active servers */
-  public int numServers() {
-    int num = -1;
-    // This synchronized seems gratuitous.
-    synchronized (this.onlineServers) {
-      num = this.onlineServers.size();
-    }
-    return num;
+  /** @return the count of active regionservers */
+  int countOfRegionServers() {
+    // Presumes onlineServers is a concurrent map
+    return this.onlineServers.size();
   }
 
   /**
@@ -476,17 +469,6 @@ public class ServerManager {
       " to dead servers, submitted shutdown handler to be executed");
   }
 
-  public boolean canAssignUserRegions() {
-    if (minimumServerCount == 0) {
-      return true;
-    }
-    return (numServers() >= minimumServerCount);
-  }
-
-  public void setMinimumServerCount(int minimumServerCount) {
-    this.minimumServerCount = minimumServerCount;
-  }
-
   // RPC methods to region servers
 
   /**
@@ -546,18 +528,25 @@ public class ServerManager {
   }
 
   /**
-   * Waits for the minimum number of servers to be running.
+   * Waits for the regionservers to report in.
+   * @throws InterruptedException 
    */
-  public void waitForMinServers() {
-    while(numServers() < minimumServerCount) {
-//        !masterStatus.getShutdownRequested().get()) {
-      LOG.info("Waiting for enough servers to check in.  Currently have " +
-          numServers() + " but need at least " + minimumServerCount);
-      try {
-        Thread.sleep(1000);
-      } catch (InterruptedException e) {
-        LOG.warn("Got interrupted waiting for servers to check in, looping");
+  public void waitForRegionServers()
+  throws InterruptedException {
+    long interval = this.master.getConfiguration().
+      getLong("hbase.master.wait.on.regionservers.interval", 3000);
+    // So, number of regionservers > 0 and its been n since last check in, break,
+    // else just stall here
+    for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
+      Thread.sleep(interval);
+      int count = countOfRegionServers();
+      if (count == oldcount && count > 0) break;
+      if (count == 0) {
+        LOG.info("Waiting on regionserver(s) to checkin");
+      } else {
+        LOG.info("Waiting on regionserver(s) count to settle; currently=" + count);
       }
+      oldcount = count;
     }
   }
 
@@ -571,8 +560,8 @@ public class ServerManager {
   }
 
   public void shutdownCluster() {
-    LOG.info("Cluster shutdown requested");
     this.clusterShutdown = true;
+    this.master.stop("Cluster shutdown requested");
   }
 
   public boolean isClusterShutdown() {
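
The new waitForRegionServers() replaces the old fixed-minimum wait with a settle loop: it polls until at least one regionserver has checked in and the count has not changed across one interval. A condensed sketch of that logic (names and the configuration key follow the diff; this is illustrative, not additional code in the commit):

    long interval = this.master.getConfiguration()
        .getLong("hbase.master.wait.on.regionservers.interval", 3000);
    int oldcount = countOfRegionServers();
    while (!this.master.isStopped()) {
      Thread.sleep(interval);
      int count = countOfRegionServers();
      // Done once at least one regionserver is in and the count held steady
      // across a full interval.
      if (count == oldcount && count > 0) break;
      oldcount = count;
    }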

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat Sep 18 00:51:05 2010
@@ -265,7 +265,7 @@ public class HRegionServer implements HR
    * @throws InterruptedException 
    */
   public HRegionServer(Configuration conf) throws IOException, InterruptedException {
-    machineName = DNS.getDefaultHost(conf.get(
+    this.machineName = DNS.getDefaultHost(conf.get(
         "hbase.regionserver.dns.interface", "default"), conf.get(
         "hbase.regionserver.dns.nameserver", "default"));
     String addressStr = machineName
@@ -434,18 +434,18 @@ public class HRegionServer implements HR
     zooKeeper = new ZooKeeperWatcher(conf, REGIONSERVER + "-"
         + serverInfo.getServerName(), this);
 
+    this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
+    this.clusterStatusTracker.start();
+    this.clusterStatusTracker.blockUntilAvailable();
+
     // create the master address manager, register with zk, and start it
     masterAddressManager = new MasterAddressTracker(zooKeeper, this);
     masterAddressManager.start();
 
-    // create the catalog tracker and start it
+    // Create the catalog tracker and start it; 
     this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
       this, this.conf.getInt("hbase.regionserver.catalog.timeout", Integer.MAX_VALUE));
     catalogTracker.start();
-
-    this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
-    this.clusterStatusTracker.start();
-    this.clusterStatusTracker.blockUntilAvailable();
   }
 
   /**
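
On the regionserver side the trackers now start in the order sketched below (condensed from the diff; surrounding initialization is omitted): the cluster-status tracker blocks until the active master has called setClusterUp(), and only then are the master-address and catalog trackers started.

    this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
    this.clusterStatusTracker.start();
    // Blocks until the active master marks the cluster up -- see the HMaster
    // changes earlier in this commit.
    this.clusterStatusTracker.blockUntilAvailable();
    // Only after the cluster is up: track the master address, then the catalog.
    masterAddressManager = new MasterAddressTracker(zooKeeper, this);
    masterAddressManager.start();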

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java Sat Sep 18 00:51:05 2010
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertFal
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
+import java.io.IOException;
 import java.util.concurrent.Semaphore;
 
 import org.apache.commons.logging.Log;
@@ -57,6 +58,39 @@ public class TestActiveMasterManager {
     TEST_UTIL.shutdownMiniZKCluster();
   }
 
+  @Test public void testRestartMaster() throws IOException, KeeperException {
+    ZooKeeperWatcher zk = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
+      "testActiveMasterManagerFromZK", null);
+    ZKUtil.createAndFailSilent(zk, zk.baseZNode);
+    try {
+      ZKUtil.deleteNode(zk, zk.masterAddressZNode);
+    } catch(KeeperException.NoNodeException nne) {}
+
+    // Create the master node with a dummy address
+    HServerAddress master = new HServerAddress("localhost", 1);
+    // Should not have a master yet
+    DummyMaster dummyMaster = new DummyMaster();
+    ActiveMasterManager activeMasterManager = new ActiveMasterManager(zk,
+      master, dummyMaster);
+    zk.registerListener(activeMasterManager);
+    assertFalse(activeMasterManager.clusterHasActiveMaster.get());
+
+    // First test becoming the active master uninterrupted
+    activeMasterManager.blockUntilBecomingActiveMaster();
+    assertTrue(activeMasterManager.clusterHasActiveMaster.get());
+    assertMaster(zk, master);
+
+    // Now pretend master restart
+    DummyMaster secondDummyMaster = new DummyMaster();
+    ActiveMasterManager secondActiveMasterManager = new ActiveMasterManager(zk,
+      master, secondDummyMaster);
+    zk.registerListener(secondActiveMasterManager);
+    assertFalse(secondActiveMasterManager.clusterHasActiveMaster.get());
+    activeMasterManager.blockUntilBecomingActiveMaster();
+    assertTrue(activeMasterManager.clusterHasActiveMaster.get());
+    assertMaster(zk, master);
+  }
+
   /**
    * Unit tests that uses ZooKeeper but does not use the master-side methods
    * but rather acts directly on ZK.
@@ -64,22 +98,21 @@ public class TestActiveMasterManager {
    */
   @Test
   public void testActiveMasterManagerFromZK() throws Exception {
-
     ZooKeeperWatcher zk = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
-        "testActiveMasterManagerFromZK", null);
+      "testActiveMasterManagerFromZK", null);
     ZKUtil.createAndFailSilent(zk, zk.baseZNode);
     try {
       ZKUtil.deleteNode(zk, zk.masterAddressZNode);
     } catch(KeeperException.NoNodeException nne) {}
 
     // Create the master node with a dummy address
-    HServerAddress firstMasterAddress = new HServerAddress("firstMaster", 1234);
-    HServerAddress secondMasterAddress = new HServerAddress("secondMaster", 1234);
+    HServerAddress firstMasterAddress = new HServerAddress("localhost", 1);
+    HServerAddress secondMasterAddress = new HServerAddress("localhost", 2);
 
     // Should not have a master yet
     DummyMaster ms1 = new DummyMaster();
     ActiveMasterManager activeMasterManager = new ActiveMasterManager(zk,
-        firstMasterAddress, ms1);
+      firstMasterAddress, ms1);
     zk.registerListener(activeMasterManager);
     assertFalse(activeMasterManager.clusterHasActiveMaster.get());
 
@@ -132,6 +165,9 @@ public class TestActiveMasterManager {
 
     assertTrue(t.manager.clusterHasActiveMaster.get());
     assertTrue(t.isActiveMaster);
+
+    LOG.info("Deleting master node");
+    ZKUtil.deleteNode(zk, zk.masterAddressZNode);
   }
 
   /**


