hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jdcry...@apache.org
Subject svn commit: r830827 - in /hadoop/hbase/branches/0.20: ./ src/java/org/apache/hadoop/hbase/master/ src/java/org/apache/hadoop/hbase/zookeeper/ src/test/org/apache/hadoop/hbase/
Date Thu, 29 Oct 2009 02:13:29 GMT
Author: jdcryans
Date: Thu Oct 29 02:13:28 2009
New Revision: 830827

URL: http://svn.apache.org/viewvc?rev=830827&view=rev
Log:
HBASE-1921  When the Master's session times out and there's only one, cluster is wedged

Modified:
    hadoop/hbase/branches/0.20/CHANGES.txt
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
    hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestZooKeeper.java

Modified: hadoop/hbase/branches/0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/CHANGES.txt?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/CHANGES.txt (original)
+++ hadoop/hbase/branches/0.20/CHANGES.txt Thu Oct 29 02:13:28 2009
@@ -26,6 +26,7 @@
    HBASE-1899  Use scanner caching in shell count
    HBASE-1903  Enable DEBUG by default
    HBASE-1918  Don't do DNS resolving in .META. scanner for each row
+   HBASE-1921  When the Master's session times out and there's only one, cluster is wedged
 
 Release 0.20.1 - Released October 12th, 2009
   INCOMPATIBLE CHANGES

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/HMaster.java Thu Oct 29 02:13:28 2009
@@ -126,7 +126,7 @@
   final int numRetries;
   final long maxRegionOpenTime;
   final int leaseTimeout;
-  private final ZooKeeperWrapper zooKeeperWrapper;
+  private ZooKeeperWrapper zooKeeperWrapper;
   private final ZKMasterAddressWatcher zkMasterAddressWatcher;
 
   volatile DelayQueue<RegionServerOperation> delayedToDoQueue =
@@ -251,27 +251,32 @@
     serverManager = new ServerManager(this);
     regionManager = new RegionManager(this);
     
-    writeAddressToZooKeeper();
+    writeAddressToZooKeeper(true);
     
     // We're almost open for business
     this.closed.set(false);
     LOG.info("HMaster initialized on " + this.address.toString());
   }
 
-  private void writeAddressToZooKeeper() {
-    while (true) {
+  /*
+   * Return true if we are the master, false if the cluster must shut down
+   * or if we only retry once.
+   */
+  private boolean writeAddressToZooKeeper(boolean retry) {
+    do {
       zkMasterAddressWatcher.waitForMasterAddressAvailability();
       // Check if we need to shutdown instead of taking control
-      if(this.shutdownRequested.get())
-      {
-        return;
+      if(this.shutdownRequested.get()){
+        LOG.debug("Won't start Master because cluster is shuting down");
+        return false;
       } else if(zooKeeperWrapper.writeMasterAddress(address)) {
         zooKeeperWrapper.setClusterState(true);
         // Watch our own node
         zooKeeperWrapper.readMasterAddress(this);
-        return;
+        return true;
       }
-    }
+    } while (retry);
+    return false;
   }
 
   private void bootstrap() throws IOException {
@@ -1152,8 +1157,25 @@
             event.getPath().equals(
                 this.zooKeeperWrapper.getMasterElectionZNode())) 
                 && !shutdownRequested.get()) {
-      LOG.error("Master lost its znode, killing itself now");
-      System.exit(1);
+
+      LOG.info("Master lost its znode, trying to get a new one");
+
+      // Can we still be the master? If not, goodbye
+
+      zooKeeperWrapper.close();
+      try {
+        zooKeeperWrapper = new ZooKeeperWrapper(conf, this);
+
+        if(!writeAddressToZooKeeper(false)) {
+          throw new Exception("Another Master is currently active");
+        }
+
+        // Verify the cluster to see if anything happened while we were away
+        verifyClusterState();
+      } catch (Exception e) {
+        LOG.error("Killing master because of", e);
+        System.exit(1);
+      }
     }
   }
    

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/RegionManager.java Thu Oct 29 02:13:28 2009
@@ -127,8 +127,6 @@
     regionsToFlush = Collections.synchronizedSortedMap(
         new TreeMap<byte[],Pair<HRegionInfo,HServerAddress>>
         (Bytes.BYTES_COMPARATOR));
-
-  private final ZooKeeperWrapper zooKeeperWrapper;
   private final int zooKeeperNumRetries;
   private final int zooKeeperPause;
 
@@ -145,7 +143,6 @@
     // Scans the meta table
     metaScannerThread = new MetaScanner(master);
 
-    zooKeeperWrapper = master.getZooKeeperWrapper();
     zooKeeperNumRetries = conf.getInt(ZOOKEEPER_RETRIES, DEFAULT_ZOOKEEPER_RETRIES);
     zooKeeperPause = conf.getInt(ZOOKEEPER_PAUSE, DEFAULT_ZOOKEEPER_PAUSE);
 
@@ -611,8 +608,8 @@
     } catch(Exception iex) {
       LOG.warn("meta scanner", iex);
     }
-    zooKeeperWrapper.clearRSDirectory();
-    zooKeeperWrapper.close();
+    master.getZooKeeperWrapper().clearRSDirectory();
+    master.getZooKeeperWrapper().close();
   }
   
   /**
@@ -1074,7 +1071,7 @@
 
   private boolean tellZooKeeperOutOfSafeMode() {
     for (int attempt = 0; attempt < zooKeeperNumRetries; ++attempt) {
-      if (zooKeeperWrapper.writeOutOfSafeMode()) {
+      if (master.getZooKeeperWrapper().writeOutOfSafeMode()) {
         return true;
       }
 
@@ -1166,7 +1163,7 @@
 
   private void writeRootRegionLocationToZooKeeper(HServerAddress address) {
     for (int attempt = 0; attempt < zooKeeperNumRetries; ++attempt) {
-      if (zooKeeperWrapper.writeRootRegionLocation(address)) {
+      if (master.getZooKeeperWrapper().writeRootRegionLocation(address)) {
         return;
       }
 

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/master/ServerManager.java Thu Oct 29 02:13:28 2009
@@ -69,7 +69,6 @@
   private static final HMsg [] EMPTY_HMSG_ARRAY = new HMsg[0];
   
   private final AtomicInteger quiescedServers = new AtomicInteger(0);
-  private final ZooKeeperWrapper zooKeeperWrapper;
 
   /** The map of known server names to server info */
   final Map<String, HServerInfo> serversToServerInfo =
@@ -140,7 +139,6 @@
    */
   public ServerManager(HMaster master) {
     this.master = master;
-    zooKeeperWrapper = master.getZooKeeperWrapper();
     this.nobalancingCount = master.getConfiguration().
       getInt("hbase.regions.nobalancing.count", 4);
     serverMonitorThread = new ServerMonitor(master.metaRescanInterval,
@@ -218,7 +216,7 @@
     // We must set this watcher here because it can be set on a fresh start
     // or on a failover
     Watcher watcher = new ServerExpirer(serverName, info.getServerAddress());
-    zooKeeperWrapper.updateRSLocationGetWatch(info, watcher);
+    master.getZooKeeperWrapper().updateRSLocationGetWatch(info, watcher);
     serversToServerInfo.put(serverName, info);
     serverAddressToServerInfo.put(info.getServerAddress(), info);
     serversToLoad.put(serverName, load);

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Thu Oct 29 02:13:28 2009
@@ -317,6 +317,7 @@
     try {
       return readAddressOrThrow(znode, watcher);
     } catch (IOException e) {
+      e.printStackTrace();
       return null;
     }
   }
@@ -493,7 +494,7 @@
 
     try {
       zooKeeper.create(outOfSafeModeZNode, new byte[0], Ids.OPEN_ACL_UNSAFE,
-                       CreateMode.EPHEMERAL);
+                       CreateMode.PERSISTENT);
       LOG.debug("Wrote out of safe mode");
       return true;
     } catch (InterruptedException e) {

Modified: hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestZooKeeper.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestZooKeeper.java?rev=830827&r1=830826&r2=830827&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestZooKeeper.java (original)
+++ hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestZooKeeper.java Thu Oct 29 02:13:28 2009
@@ -109,47 +109,53 @@
     ZooKeeper zk = new ZooKeeper(quorumServers, sessionTimeout, EmptyWatcher.instance, sessionID, password);
     zk.close();
 
-    Thread.sleep(sessionTimeout * 3);
+    Thread.sleep(sessionTimeout * 2);
 
     System.err.println("ZooKeeper should have timed out");
     connection.relocateRegion(HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_BYTE_ARRAY);
   }
 
-  public void testRegionServerSessionExpired() {
-    try {
-      this.conf.setBoolean("hbase.regionserver.restart.on.zk.expire", true);
-      new HTable(conf, HConstants.META_TABLE_NAME);
-  
-      ZooKeeperWrapper zkw = new ZooKeeperWrapper(conf, EmptyWatcher.instance);
-      String quorumServers = zkw.getQuorumServers();
-      int sessionTimeout = conf.getInt("zookeeper.session.timeout", 2 * 1000);
-
-      HRegionServer rs = cluster.getRegionServer(0);
-      ZooKeeperWrapper rsZK = rs.getZooKeeperWrapper();
-      long sessionID = rsZK.getSessionID();
-      byte[] password = rsZK.getSessionPassword();
-  
-      ZooKeeper zk = new ZooKeeper(quorumServers, sessionTimeout, EmptyWatcher.instance, sessionID, password);
-      zk.close();
+  public void testRegionServerSessionExpired() throws Exception{
+    this.conf.setBoolean("hbase.regionserver.restart.on.zk.expire", true);
+    new HTable(conf, HConstants.META_TABLE_NAME);
+    HRegionServer rs = cluster.getRegionServer(0);
+    sessionExpirationHelper(rs.getZooKeeperWrapper());
+  }
 
-      Thread.sleep(sessionTimeout * 3);
+  public void testMasterSessionExpired() throws Exception {
+    new HTable(conf, HConstants.META_TABLE_NAME);
+    HMaster master = cluster.getMaster();
+    sessionExpirationHelper(master.getZooKeeperWrapper());
+  }
+
+  public void sessionExpirationHelper(ZooKeeperWrapper nodeZK) throws Exception{
+    ZooKeeperWrapper zkw = new ZooKeeperWrapper(conf, EmptyWatcher.instance);
+    String quorumServers = zkw.getQuorumServers();
+    int sessionTimeout = 5 * 1000; // 5 seconds
+
+    byte[] password = nodeZK.getSessionPassword();
+    long sessionID = nodeZK.getSessionID();
+
+    ZooKeeper zk = new ZooKeeper(quorumServers,
+        sessionTimeout, EmptyWatcher.instance, sessionID, password);
+
+    zk.close();
+
+    Thread.sleep(sessionTimeout * 3L);
+
+    new HTable(conf, HConstants.META_TABLE_NAME);
+
+    HBaseAdmin admin = new HBaseAdmin(conf);
+    HTableDescriptor desc = new HTableDescriptor("test");
+    HColumnDescriptor family = new HColumnDescriptor("fam");
+    desc.addFamily(family);
+    admin.createTable(desc);
+
+    HTable table = new HTable("test");
+    Put put = new Put(Bytes.toBytes("testrow"));
+    put.add(Bytes.toBytes("fam"), Bytes.toBytes("col"), Bytes.toBytes("testdata"));
+    table.put(put);
 
-      new HTable(conf, HConstants.META_TABLE_NAME);
-  
-      HBaseAdmin admin = new HBaseAdmin(conf);
-      HTableDescriptor desc = new HTableDescriptor("test");
-      HColumnDescriptor family = new HColumnDescriptor("fam:");
-      desc.addFamily(family);
-      admin.createTable(desc);
-  
-      HTable table = new HTable("test");
-      Put put = new Put(Bytes.toBytes("testrow"));
-      put.add(Bytes.toBytes("fam"), Bytes.toBytes("col"), Bytes.toBytes("testdata"));
-      table.put(put);
-    } catch (Exception e) {
-      e.printStackTrace();
-      fail();
-    }
   }
   
   public void testMultipleZK() {



Mime
View raw message