Return-Path: Delivered-To: apmail-hadoop-hbase-commits-archive@minotaur.apache.org Received: (qmail 30404 invoked from network); 26 May 2009 18:02:17 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 26 May 2009 18:02:17 -0000 Received: (qmail 42887 invoked by uid 500); 26 May 2009 18:02:29 -0000 Delivered-To: apmail-hadoop-hbase-commits-archive@hadoop.apache.org Received: (qmail 42856 invoked by uid 500); 26 May 2009 18:02:29 -0000 Mailing-List: contact hbase-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hbase-dev@hadoop.apache.org Delivered-To: mailing list hbase-commits@hadoop.apache.org Received: (qmail 42847 invoked by uid 99); 26 May 2009 18:02:29 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 26 May 2009 18:02:29 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 26 May 2009 18:02:18 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id D29772388858; Tue, 26 May 2009 18:01:57 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r778819 - in /hadoop/hbase/trunk: ./ src/java/org/apache/hadoop/hbase/client/ src/java/org/apache/hadoop/hbase/ipc/ src/java/org/apache/hadoop/hbase/master/ src/java/org/apache/hadoop/hbase/regionserver/ src/java/org/apache/hadoop/hbase/zoo... 
Date: Tue, 26 May 2009 18:01:57 -0000 To: hbase-commits@hadoop.apache.org From: jdcryans@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20090526180157.D29772388858@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jdcryans Date: Tue May 26 18:01:56 2009 New Revision: 778819 URL: http://svn.apache.org/viewvc?rev=778819&view=rev Log: HBASE-1302 When a new master comes up, regionservers should continue with their region assignments from the last master Modified: hadoop/hbase/trunk/CHANGES.txt hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnection.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Modified: hadoop/hbase/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/CHANGES.txt (original) +++ hadoop/hbase/trunk/CHANGES.txt Tue May 26 18:01:56 2009 @@ -154,6 +154,8 @@ localhost_1237525439599_56094" <- You'd have to be perverse to recognize that as a hostname, startcode, and port HBASE-1395 InfoServers no longer put up a UI + HBASE-1302 When a new master comes up, regionservers should continue with + their region assignments from the last master IMPROVEMENTS HBASE-1089 Add count of regions on filesystem to master UI; add percentage Modified: 
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnection.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnection.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnection.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnection.java Tue May 26 18:01:56 2009 @@ -133,6 +133,17 @@ public HRegionInterface getHRegionConnection(HServerAddress regionServer) throws IOException; + /** + * Establishes a connection to the region server at the specified address. + * @param regionServer - the server to connect to + * @param getMaster - do we check if master is alive + * @return proxy for HRegionServer + * @throws IOException + */ + public HRegionInterface getHRegionConnection( + HServerAddress regionServer, boolean getMaster) + throws IOException; + /** * Find region location hosting passed row * @param tableName Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Tue May 26 18:01:56 2009 @@ -116,6 +116,7 @@ } } + /* Encapsulates finding the servers for an HBase instance */ private static class TableServers implements ServerConnection, HConstants, Watcher { private static final Log LOG = LogFactory.getLog(TableServers.class); @@ -766,9 +767,12 @@ tableLocations.put(startKey, location); } - public HRegionInterface getHRegionConnection(HServerAddress regionServer) + public 
HRegionInterface getHRegionConnection( + HServerAddress regionServer, boolean getMaster) throws IOException { - getMaster(); + if(getMaster) { + getMaster(); + } HRegionInterface server; synchronized (this.servers) { // See if we already have a connection @@ -787,6 +791,12 @@ } return server; } + + public HRegionInterface getHRegionConnection( + HServerAddress regionServer) + throws IOException { + return getHRegionConnection(regionServer, true); + } public synchronized ZooKeeperWrapper getZooKeeperWrapper() throws IOException { if (zooKeeperWrapper == null) { Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java Tue May 26 18:01:56 2009 @@ -69,7 +69,8 @@ * HMasterInterface.findRootRegion. We use ZooKeeper to store root region * location instead. *
<li>Version 17: Added incrementColumnValue.</li>
+ * <li>Version 18: HBASE-1302.</li>
  • * */ - public static final long versionID = 17L; + public static final long versionID = 18L; } Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java Tue May 26 18:01:56 2009 @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.io.HbaseMapWritable; import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.NotServingRegionException; /** @@ -306,4 +307,18 @@ */ public long incrementColumnValue(byte [] regionName, byte [] row, byte [] column, long amount) throws IOException; + + /** + * Method used when a master is taking the place of another failed one. + * @return All regions assigned on this region server + * @throws IOException + */ + public HRegionInfo[] getRegionsAssignment() throws IOException; + + /** + * Method used when a master is taking the place of another failed one. 
+ * @return The HSI + * @throws IOException + */ + public HServerInfo getHServerInfo() throws IOException; } Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java Tue May 26 18:01:56 2009 @@ -25,6 +25,7 @@ import java.lang.reflect.Constructor; import java.net.InetAddress; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; @@ -44,6 +45,7 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HMsg; import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.HServerAddress; import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.HServerLoad; @@ -374,6 +376,7 @@ public void run() { final String threadName = "HMaster"; Thread.currentThread().setName(threadName); + verifyClusterState(); startServiceThreads(); /* Main processing loop */ try { @@ -504,6 +507,61 @@ } /* + * Verifies if this instance of HBase is fresh or the master was started + * following a failover. In the second case, it inspects the region server + * directory and gets their regions assignment. 
+ */ + private void verifyClusterState() { + try { + LOG.debug("Checking cluster state..."); + HServerAddress rootLocation = zooKeeperWrapper.readRootRegionLocation(); + List addresses = zooKeeperWrapper.scanRSDirectory(); + + // Check if this is a fresh start of the cluster + if(addresses.size() == 0) { + LOG.debug("This is a fresh start, proceeding with normal startup"); + return; + } + LOG.info("This is a failover, ZK inspection begins..."); + boolean isRootRegionAssigned = false; + Map assignedRegions = + new HashMap(); + // This is a failover case. We must: + // - contact every region server to add them to the regionservers list + // - get their current regions assignment + for (HServerAddress address : addresses) { + HRegionInterface hri = + this.connection.getHRegionConnection(address, false); + HServerInfo info = hri.getHServerInfo(); + LOG.debug("Inspection found server " + info.getName()); + serverManager.recordNewServer(info); + HRegionInfo[] regions = hri.getRegionsAssignment(); + for (HRegionInfo region : regions) { + if(region.isRootRegion()) { + connection.setRootRegionLocation( + new HRegionLocation(region, rootLocation)); + regionManager.setRootRegionLocation(rootLocation); + // Undo the unassign work in the RegionManager constructor + regionManager.removeRegion(region); + isRootRegionAssigned = true; + } + else if(region.isMetaRegion()) { + MetaRegion m = + new MetaRegion(new HServerAddress(address), + region.getRegionName(), region.getStartKey()); + regionManager.addMetaRegionToScan(m); + } + assignedRegions.put(region.getRegionName(), region); + } + } + LOG.info("Inspection found " + assignedRegions.size() + " regions, " + + (isRootRegionAssigned ? "with -ROOT-" : "but -ROOT- was MIA")); + } catch(IOException ex) { + ex.printStackTrace(); + } + } + + /* * Start up all services. If any of these threads gets an unhandled exception * then they just die with a logged message. 
This should be fine because * in general, we do not expect the master to get such unhandled exceptions Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java Tue May 26 18:01:56 2009 @@ -554,6 +554,7 @@ } catch(Exception iex) { LOG.warn("meta scanner", iex); } + zooKeeperWrapper.clearRSDirectory(); zooKeeperWrapper.close(); } Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java Tue May 26 18:01:56 2009 @@ -161,8 +161,6 @@ LOG.debug("deadServers.contains: " + deadServers.contains(serverName)); throw new Leases.LeaseStillHeldException(serverName); } - Watcher watcher = new ServerExpirer(serverName, info.getServerAddress()); - zooKeeperWrapper.updateRSLocationGetWatch(info, watcher); LOG.info("Received start message from: " + serverName); // Go on to process the regionserver registration. 
@@ -198,9 +196,21 @@ LOG.error("Insertion into toDoQueue was interrupted", e); } } - // record new server - load = new HServerLoad(); + recordNewServer(info); + } + + /** + * Adds the HSI to the RS list + * @param info The region server informations + */ + public void recordNewServer(HServerInfo info) { + HServerLoad load = new HServerLoad(); + String serverName = HServerInfo.getServerName(info); info.setLoad(load); + // We must set this watcher here because it can be set on a fresh start + // or on a failover + Watcher watcher = new ServerExpirer(serverName, info.getServerAddress()); + zooKeeperWrapper.updateRSLocationGetWatch(info, watcher); serversToServerInfo.put(serverName, info); serverAddressToServerInfo.put(info.getServerAddress(), info); serversToLoad.put(serverName, load); Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue May 26 18:01:56 2009 @@ -323,15 +323,6 @@ private void reinitializeZooKeeper() throws IOException { zooKeeperWrapper = new ZooKeeperWrapper(conf); watchMasterAddress(); - - boolean startCodeOk = false; - while(!startCodeOk) { - serverInfo.setStartCode(System.currentTimeMillis()); - startCodeOk = zooKeeperWrapper.writeRSLocation(serverInfo); - if(!startCodeOk) { - LOG.debug("Start code already taken, trying another one"); - } - } } private void reinitializeThreads() { @@ -384,6 +375,8 @@ if (state == KeeperState.Expired) { LOG.error("ZooKeeper session expired"); restart(); + } else if (type == EventType.NodeDeleted) { + watchMasterAddress(); } else if (type == 
EventType.NodeCreated) { getMaster(); @@ -1330,6 +1323,14 @@ if (LOG.isDebugEnabled()) LOG.debug("sending initial server load: " + hsl); lastMsg = System.currentTimeMillis(); + boolean startCodeOk = false; + while(!startCodeOk) { + serverInfo.setStartCode(System.currentTimeMillis()); + startCodeOk = zooKeeperWrapper.writeRSLocation(serverInfo); + if(!startCodeOk) { + LOG.debug("Start code already taken, trying another one"); + } + } result = this.hbaseMaster.regionServerStartup(serverInfo); break; } catch (Leases.LeaseStillHeldException e) { @@ -2451,7 +2452,20 @@ checkFileSystem(); throw e; } - - + } + + /** {@inheritDoc} */ + public HRegionInfo[] getRegionsAssignment() throws IOException { + HRegionInfo[] regions = new HRegionInfo[onlineRegions.size()]; + Iterator ite = onlineRegions.values().iterator(); + for(int i = 0; ite.hasNext(); i++) { + regions[i] = ite.next().getRegionInfo(); + } + return regions; + } + + /** {@inheritDoc} */ + public HServerInfo getHServerInfo() throws IOException { + return serverInfo; } } Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java?rev=778819&r1=778818&r2=778819&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWrapper.java Tue May 26 18:01:56 2009 @@ -462,12 +462,12 @@ */ public boolean writeRSLocation(HServerInfo info) { ensureExists(rsZNode); - byte[] data = Bytes.toBytes(info.getServerAddress().getBindAddress()); + byte[] data = Bytes.toBytes(info.getServerAddress().toString()); String znode = joinPath(rsZNode, Long.toString(info.getStartCode())); try { zooKeeper.create(znode, data, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); LOG.debug("Created ZNode " + 
znode - + " with data " + info.getServerAddress().getBindAddress()); + + " with data " + info.getServerAddress().toString()); return true; } catch (KeeperException e) { LOG.warn("Failed to create " + znode + " znode in ZooKeeper: " + e); @@ -484,12 +484,12 @@ * @return true if the update is done, false if it failed */ public boolean updateRSLocationGetWatch(HServerInfo info, Watcher watcher) { - byte[] data = Bytes.toBytes(info.getServerAddress().getBindAddress()); - String znode = rsZNode + "/" + info.getStartCode(); + byte[] data = Bytes.toBytes(info.getServerAddress().toString()); + String znode = rsZNode + ZNODE_PATH_SEPARATOR + info.getStartCode(); try { zooKeeper.setData(znode, data, -1); LOG.debug("Updated ZNode " + znode - + " with data " + info.getServerAddress().getBindAddress()); + + " with data " + info.getServerAddress().toString()); zooKeeper.getData(znode, watcher, null); return true; } catch (KeeperException e) { @@ -501,6 +501,43 @@ return false; } + /** + * Scans the regions servers directory + * @return A list of server addresses + */ + public List scanRSDirectory() { + List addresses = new ArrayList(); + try { + List nodes = zooKeeper.getChildren(rsZNode, false); + for (String node : nodes) { + addresses.add(readAddress(rsZNode + ZNODE_PATH_SEPARATOR + node, null)); + } + } catch (KeeperException e) { + LOG.warn("Failed to read " + rsZNode + " znode in ZooKeeper: " + e); + } catch (InterruptedException e) { + LOG.warn("Failed to read " + rsZNode + " znode in ZooKeeper: " + e); + } + return addresses; + } + + /** + * Method used to make sure the region server directory is empty. 
+ * + */ + public void clearRSDirectory() { + try { + List nodes = zooKeeper.getChildren(rsZNode, false); + for (String node : nodes) { + LOG.debug("Deleting node: " + node); + zooKeeper.delete(node, -1); + } + } catch (KeeperException e) { + LOG.warn("Failed to delete " + rsZNode + " znode in ZooKeeper: " + e); + } catch (InterruptedException e) { + LOG.warn("Failed to delete " + rsZNode + " znode in ZooKeeper: " + e); + } + } + private boolean checkExistenceOf(String path) { Stat stat = null; try {