Return-Path: Delivered-To: apmail-hadoop-hbase-commits-archive@minotaur.apache.org Received: (qmail 6960 invoked from network); 31 May 2009 16:28:26 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 31 May 2009 16:28:26 -0000 Received: (qmail 50081 invoked by uid 500); 31 May 2009 16:28:38 -0000 Delivered-To: apmail-hadoop-hbase-commits-archive@hadoop.apache.org Received: (qmail 50034 invoked by uid 500); 31 May 2009 16:28:38 -0000 Mailing-List: contact hbase-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hbase-dev@hadoop.apache.org Delivered-To: mailing list hbase-commits@hadoop.apache.org Received: (qmail 50025 invoked by uid 99); 31 May 2009 16:28:38 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 31 May 2009 16:28:38 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 31 May 2009 16:28:26 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 0AB9C238886D; Sun, 31 May 2009 16:28:05 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r780467 - in /hadoop/hbase/branches/0.19: ./ lib/ src/java/org/apache/hadoop/hbase/client/ src/java/org/apache/hadoop/hbase/ipc/ src/java/org/apache/hadoop/hbase/master/ Date: Sun, 31 May 2009 16:28:04 -0000 To: hbase-commits@hadoop.apache.org From: stack@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20090531162805.0AB9C238886D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: stack Date: Sun May 31 16:28:03 2009 New Revision: 780467 URL: http://svn.apache.org/viewvc?rev=780467&view=rev Log: HBASE-1457 Taking down ROOT/META regionserver can result in cluster becoming in-operational Removed: hadoop/hbase/branches/0.19/lib/zookeeper-3.0.1.jar Modified: hadoop/hbase/branches/0.19/CHANGES.txt hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/HMaster.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/MetaRegion.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionStatusChange.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionManager.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionServerOperation.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RootScanner.java hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ServerManager.java Modified: hadoop/hbase/branches/0.19/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/CHANGES.txt?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/CHANGES.txt (original) +++ hadoop/hbase/branches/0.19/CHANGES.txt Sun May 31 16:28:03 2009 @@ -2,6 +2,8 @@ Release 0.19.4 - Unreleased BUG FIXES HBASE-1446 2 javdoc build warning + HBASE-1457 Taking down ROOT/META regionserver can result in cluster + becoming in-operational (Ryan Rawson via Stack) Release 0.19.3 - May 27th, 2009 BUG FIXES Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Sun May 31 16:28:03 2009 @@ -304,7 +304,7 @@ rowResult.get(COL_REGIONINFO)); // Only examine the rows where the startKey is zero length - if (info.getStartKey().length == 0) { + if (info != null && info.getStartKey().length == 0) { uniqueTables.add(info.getTableDesc()); } return true; Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java Sun May 31 16:28:03 2009 @@ -65,7 +65,7 @@ public class HBaseClient { public static final Log LOG = - LogFactory.getLog("org.apache.hadoop.ipc.HBaseClass"); + LogFactory.getLog("org.apache.hadoop.ipc.HBaseClient"); private Hashtable connections = new Hashtable(); @@ -841,4 +841,4 @@ return address.hashCode() ^ System.identityHashCode(ticket); } } -} \ No newline at end of file +} Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/HMaster.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/HMaster.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/HMaster.java Sun May 31 16:28:03 2009 @@ -32,6 +32,7 @@ import java.util.concurrent.DelayQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; +import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; @@ -118,7 +119,7 @@ volatile DelayQueue delayedToDoQueue = new DelayQueue(); volatile BlockingQueue toDoQueue = - new LinkedBlockingQueue(); + new PriorityBlockingQueue(); private final HBaseServer server; private final HServerAddress address; @@ -233,6 +234,9 @@ this.address = new HServerAddress(server.getListenerAddress()); conf.set(MASTER_ADDRESS, address.toString()); + // dont retry too much + conf.setInt("hbase.client.retries.number", 3); + this.connection = ServerConnectionManager.getConnection(conf); this.metaRescanInterval = @@ -476,15 +480,7 @@ return false; } LOG.warn("Processing pending operations: " + op.toString(), ex); - try { - // put the operation back on the queue... maybe it'll work next time. - toDoQueue.put(op); - } catch (InterruptedException e) { - throw new RuntimeException( - "Putting into toDoQueue was interrupted.", e); - } catch (Exception e) { - LOG.error("main processing loop: " + op.toString(), e); - } + delayedToDoQueue.put(op); } return true; } Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/MetaRegion.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/MetaRegion.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/MetaRegion.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/MetaRegion.java Sun May 31 16:28:03 2009 @@ -19,46 +19,37 @@ */ package org.apache.hadoop.hbase.master; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HServerAddress; -import org.apache.hadoop.hbase.HStoreKey; import org.apache.hadoop.hbase.util.Bytes; /** Describes a meta region and its server */ public class MetaRegion implements Comparable { private final HServerAddress server; - private final byte [] regionName; - private final byte [] startKey; + private final HRegionInfo regionInfo; - MetaRegion(final HServerAddress server, final byte [] regionName) { - this (server, regionName, HConstants.EMPTY_START_ROW); - } - - MetaRegion(final HServerAddress server, final byte [] regionName, - final byte [] startKey) { + MetaRegion(final HServerAddress server, HRegionInfo regionInfo) { if (server == null) { throw new IllegalArgumentException("server cannot be null"); } this.server = server; - if (regionName == null) { - throw new IllegalArgumentException("regionName cannot be null"); + if (regionInfo == null) { + throw new IllegalArgumentException("regionInfo cannot be null"); } - this.regionName = regionName; - this.startKey = startKey; + this.regionInfo = regionInfo; } @Override public String toString() { - return "{regionname: " + Bytes.toString(this.regionName) + - ", startKey: <" + Bytes.toString(this.startKey) + - ">, server: " + this.server.toString() + "}"; + return "{server: " + this.server.toString() + ", regionname: " + + regionInfo.getRegionNameAsString() + ", startKey: <" + + Bytes.toString(regionInfo.getStartKey()) + ">}"; } /** @return the regionName */ public byte [] getRegionName() { - return regionName; + return regionInfo.getRegionName(); } /** @return the server */ @@ -68,7 +59,11 @@ /** @return the startKey */ public byte [] getStartKey() { - return startKey; + return regionInfo.getStartKey(); + } + + public HRegionInfo getRegionInfo() { + return regionInfo; } @Override @@ -78,23 +73,17 @@ @Override public int hashCode() { - int result = this.regionName.hashCode(); - result ^= this.startKey.hashCode(); - return result; + return regionInfo.hashCode(); } // Comparable public int compareTo(MetaRegion other) { - int result = Bytes.compareTo(this.regionName, other.getRegionName()); - if(result == 0) { - result = HStoreKey.compareTwoRowKeys(HRegionInfo.FIRST_META_REGIONINFO, - this.startKey, other.getStartKey()); - if (result == 0) { - // Might be on different host? - result = this.server.compareTo(other.server); - } + int cmp = regionInfo.compareTo(other.regionInfo); + if (cmp == 0) { + // Might be on different host? + cmp = this.server.compareTo(other.server); } - return result; + return cmp; } -} \ No newline at end of file +} Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java Sun May 31 16:28:03 2009 @@ -25,6 +25,7 @@ import org.apache.hadoop.hbase.HServerAddress; import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.RegionHistorian; +import org.apache.hadoop.hbase.ipc.HRegionInterface; import org.apache.hadoop.hbase.io.BatchUpdate; import org.apache.hadoop.hbase.util.Bytes; @@ -61,62 +62,67 @@ @Override protected boolean process() throws IOException { - Boolean result = - new RetryableMetaOperation(getMetaRegion(), this.master) { - private final RegionHistorian historian = RegionHistorian.getInstance(); - - public Boolean call() throws IOException { - LOG.info(regionInfo.getRegionNameAsString() + " open on " + - serverAddress.toString()); - if (!metaRegionAvailable()) { - // We can't proceed unless the meta region we are going to update - // is online. metaRegionAvailable() has put this operation on the - // delayedToDoQueue, so return true so the operation is not put - // back on the toDoQueue - return true; - } + if (!metaRegionAvailable()) { + // We can't proceed unless the meta region we are going to update + // is online. metaRegionAvailable() has put this operation on the + // delayedToDoQueue, so return true so the operation is not put + // back on the toDoQueue + return true; + } + + final RegionHistorian historian = RegionHistorian.getInstance(); + HRegionInterface server = + master.connection.getHRegionConnection(getMetaRegion().getServer()); + LOG.info(regionInfo.getRegionNameAsString() + " open on " + + this.serverAddress.toString()); - // Register the newly-available Region's location. - LOG.info("updating row " + regionInfo.getRegionNameAsString() + - " in region " + Bytes.toString(metaRegionName) + - " with startcode " + Bytes.toLong(startCode) + " and server " + - serverAddress.toString()); - BatchUpdate b = new BatchUpdate(regionInfo.getRegionName()); - b.put(COL_SERVER, Bytes.toBytes(serverAddress.toString())); - b.put(COL_STARTCODE, startCode); - server.batchUpdate(metaRegionName, b, -1L); - if (!this.historian.isOnline()) { - // This is safest place to do the onlining of the historian in - // the master. When we get to here, we know there is a .META. - // for the historian to go against. - this.historian.online(this.master.getConfiguration()); - } - this.historian.addRegionOpen(regionInfo, serverAddress); - this.historian.getRegionHistory("dummy"); - synchronized (master.regionManager) { - if (isMetaTable) { - // It's a meta region. - MetaRegion m = new MetaRegion(new HServerAddress(serverAddress), - regionInfo.getRegionName(), regionInfo.getStartKey()); - if (!master.regionManager.isInitialMetaScanComplete()) { - // Put it on the queue to be scanned for the first time. - LOG.debug("Adding " + m.toString() + " to regions to scan"); - master.regionManager.addMetaRegionToScan(m); - } else { - // Add it to the online meta regions - LOG.debug("Adding to onlineMetaRegions: " + m.toString()); - master.regionManager.putMetaRegionOnline(m); - // Interrupting the Meta Scanner sleep so that it can - // process regions right away - master.regionManager.metaScannerThread.interrupt(); - } + // Register the newly-available Region's location. + LOG.info("updating row " + regionInfo.getRegionNameAsString() + + " in region " + Bytes.toString(metaRegionName) + " with " + + " with startcode " + Bytes.toString(this.startCode) + " and server " + + this.serverAddress); + BatchUpdate b = new BatchUpdate(regionInfo.getRegionName()); + b.put(COL_SERVER, + Bytes.toBytes(this.serverAddress.toString())); + b.put(COL_STARTCODE, this.startCode); + server.batchUpdate(metaRegionName, b, -1L); + if (!historian.isOnline()) { + // This is safest place to do the onlining of the historian in + // the master. When we get to here, we know there is a .META. + // for the historian to go against. + historian.online(this.master.getConfiguration()); + } + historian.addRegionOpen(regionInfo, this.serverAddress); + synchronized (master.regionManager) { + if (isMetaTable) { + // It's a meta region. + MetaRegion m = + new MetaRegion(new HServerAddress(this.serverAddress), regionInfo); + if (!master.regionManager.isInitialMetaScanComplete()) { + // Put it on the queue to be scanned for the first time. + if (LOG.isDebugEnabled()) { + LOG.debug("Adding " + m.toString() + " to regions to scan"); + } + master.regionManager.addMetaRegionToScan(m); + } else { + // Add it to the online meta regions + if (LOG.isDebugEnabled()) { + LOG.debug("Adding to onlineMetaRegions: " + m.toString()); } - // If updated successfully, remove from pending list. - master.regionManager.removeRegion(regionInfo); - return true; + master.regionManager.putMetaRegionOnline(m); + // Interrupting the Meta Scanner sleep so that it can + // process regions right away + master.regionManager.metaScannerThread.interrupt(); } - } - }.doWithRetries(); - return result == null ? true : result; + } + // If updated successfully, remove from pending list. + master.regionManager.removeRegion(regionInfo); + return true; + } + } + + @Override + protected int getPriority() { + return 0; // highest priority } } Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionStatusChange.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionStatusChange.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionStatusChange.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessRegionStatusChange.java Sun May 31 16:28:03 2009 @@ -47,6 +47,7 @@ if (isMetaTable) { // This operation is for the meta table if (!rootAvailable()) { + requeue(); // But we can't proceed unless the root region is available available = false; } @@ -67,7 +68,7 @@ if (isMetaTable) { this.metaRegionName = HRegionInfo.ROOT_REGIONINFO.getRegionName(); this.metaRegion = new MetaRegion(master.getRootRegionLocation(), - this.metaRegionName, HConstants.EMPTY_START_ROW); + HRegionInfo.ROOT_REGIONINFO); } else { this.metaRegion = master.regionManager.getFirstMetaRegionForRegion(regionInfo); Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Sun May 31 16:28:03 2009 @@ -31,12 +31,12 @@ import org.apache.hadoop.hbase.HServerAddress; import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.RemoteExceptionHandler; +import org.apache.hadoop.hbase.io.RowResult; import org.apache.hadoop.hbase.ipc.HRegionInterface; import org.apache.hadoop.hbase.regionserver.HLog; import org.apache.hadoop.hbase.regionserver.HRegion; -import org.apache.hadoop.hbase.util.Writables; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.io.RowResult; +import org.apache.hadoop.hbase.util.Writables; /** * Instantiated when a server's lease has expired, meaning it has crashed. @@ -49,11 +49,12 @@ * Cache of the server name. */ private final String deadServerStr; - private final boolean rootRegionServer; - private boolean rootRegionReassigned = false; + private boolean isRootServer; + private List metaRegions; private Path oldLogDir; private boolean logSplit; private boolean rootRescanned; + private HServerAddress deadServerAddress; private class ToDoEntry { @@ -71,18 +72,34 @@ /** * @param master * @param serverInfo - * @param rootRegionServer */ - public ProcessServerShutdown(HMaster master, HServerInfo serverInfo, - boolean rootRegionServer) { + public ProcessServerShutdown(HMaster master, HServerInfo serverInfo) { super(master); this.deadServer = serverInfo.getServerAddress(); + this.deadServerAddress = serverInfo.getServerAddress(); this.deadServerStr = this.deadServer.toString(); - this.rootRegionServer = rootRegionServer; this.logSplit = false; this.rootRescanned = false; this.oldLogDir = new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo)); + + // check to see if I am responsible for either ROOT or any of the META tables. + + closeMetaRegions(); + } + + private void closeMetaRegions() { + isRootServer = master.regionManager.isRootServer(deadServerAddress); + if (isRootServer) { + master.regionManager.unsetRootRegion(); + } + List metaStarts = master.regionManager.listMetaRegionsForServer(deadServerAddress); + + metaRegions = new ArrayList(); + for (byte [] region : metaStarts) { + MetaRegion r = master.regionManager.offlineMetaRegion(region); + metaRegions.add(r); + } } @Override @@ -255,16 +272,22 @@ logSplit = true; } - if (this.rootRegionServer && !this.rootRegionReassigned) { - // avoid multiple root region reassignment - this.rootRegionReassigned = true; - // The server that died was serving the root region. Now that the log - // has been split, get it reassigned. + LOG.info("Log split complete, meta reassignment and scanning:"); + + if (this.isRootServer) { + LOG.info("ProcessServerShutdown reassigning ROOT region"); master.regionManager.reassignRootRegion(); - // When we call rootAvailable below, it will put us on the delayed - // to do queue to allow some time to pass during which the root - // region will hopefully get reassigned. + + isRootServer = false; // prevent double reassignment... heh. + } + + for (MetaRegion metaRegion : metaRegions) { + LOG.info("ProcessServerShutdown setting to unassigned: " + metaRegion.toString()); + master.regionManager.setUnassigned(metaRegion.getRegionInfo(), true); } + // one the meta regions are online, "forget" about them. Since there are explicit + // checks below to make sure meta/root are online, this is likely to occur. + metaRegions.clear(); if (!rootAvailable()) { // Return true so that worker does not put this request back on the @@ -277,8 +300,7 @@ // Scan the ROOT region Boolean result = new ScanRootRegion( new MetaRegion(master.getRootRegionLocation(), - HRegionInfo.ROOT_REGIONINFO.getRegionName(), - HConstants.EMPTY_START_ROW), this.master).doWithRetries(); + HRegionInfo.ROOT_REGIONINFO), this.master).doWithRetries(); if (result == null) { // Master is closing - give up return true; @@ -316,4 +338,9 @@ } return true; } -} \ No newline at end of file + + @Override + protected int getPriority() { + return 2; // high but not highest priority + } +} Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionManager.java Sun May 31 16:28:03 2009 @@ -94,7 +94,7 @@ * * @see RegionState inner-class below */ - private final SortedMap regionsInTransition = + final SortedMap regionsInTransition = Collections.synchronizedSortedMap( new TreeMap(Bytes.BYTES_COMPARATOR)); @@ -154,6 +154,7 @@ synchronized (regionsInTransition) { rootRegionLocation.set(null); regionsInTransition.remove(HRegionInfo.ROOT_REGIONINFO.getRegionName()); + LOG.info("-ROOT- region unset (but not set to be reassigned)"); } } @@ -164,6 +165,7 @@ RegionState s = new RegionState(HRegionInfo.ROOT_REGIONINFO); s.setUnassigned(); regionsInTransition.put(HRegionInfo.ROOT_REGIONINFO.getRegionName(), s); + LOG.info("ROOT inserted into regionsInTransition"); } } } @@ -180,9 +182,12 @@ void assignRegions(HServerInfo info, String serverName, HRegionInfo[] mostLoadedRegions, ArrayList returnMsgs) { HServerLoad thisServersLoad = info.getLoad(); + boolean isSingleServer = master.serverManager.numServers() == 1; + // figure out what regions need to be assigned and aren't currently being // worked on elsewhere. - Set regionsToAssign = regionsAwaitingAssignment(); + Set regionsToAssign = regionsAwaitingAssignment(info.getServerAddress(), + isSingleServer); if (regionsToAssign.size() == 0) { // There are no regions waiting to be assigned. if (!inSafeMode()) { @@ -205,7 +210,7 @@ } } else { // if there's only one server, just give it all the regions - if (master.serverManager.numServers() == 1) { + if (isSingleServer) { assignRegionsToOneServer(regionsToAssign, serverName, returnMsgs); } else { // otherwise, give this server a few regions taking into account the @@ -226,11 +231,21 @@ private void assignRegionsToMultipleServers(final HServerLoad thisServersLoad, final Set regionsToAssign, final String serverName, final ArrayList returnMsgs) { - + + boolean isMetaAssign = false; + for (RegionState s : regionsToAssign) { + if (s.getRegionInfo().isMetaRegion()) + isMetaAssign = true; + } + int nRegionsToAssign = regionsToAssign.size(); int nregions = regionsPerServer(nRegionsToAssign, thisServersLoad); + LOG.debug("multi assing for " + serverName + ": nregions to assign: " + + nRegionsToAssign + +" and nregions: " + nregions + + " metaAssign: " + isMetaAssign); nRegionsToAssign -= nregions; - if (nRegionsToAssign > 0) { + if (nRegionsToAssign > 0 || isMetaAssign) { // We still have more regions to assign. See how many we can assign // before this server becomes more heavily loaded than the next // most heavily loaded server. @@ -246,6 +261,8 @@ // continue; } + LOG.debug("Doing for " + serverName + " nregions: " + nregions + + " and nRegionsToAssign: " + nRegionsToAssign); if (nregions < nRegionsToAssign) { // There are some more heavily loaded servers // but we can't assign all the regions to this server. @@ -308,8 +325,33 @@ LOG.info("Assigning region " + Bytes.toString(regionName) + " to " + serverName); rs.setPendingOpen(serverName); this.regionsInTransition.put(regionName, rs); - this.historian.addRegionAssignment(rs.getRegionInfo(), - serverName); + + + // Since the meta/root may not be available at this moment, we + try { + // TODO move this into an actual class, and use the RetryableMetaOperation + master.toDoQueue.put( + new RegionServerOperation(master) { + protected boolean process() throws IOException { + if (!rootAvailable() || !metaTableAvailable()) { + return true; // the two above us will put us on the delayed queue + } + + // this call can cause problems if meta/root is offline! + historian.addRegionAssignment(rs.getRegionInfo(), + serverName); + return true; + } + public String toString() { + return "RegionAssignmentHistorian from " + serverName; + } + } + ); + } catch (InterruptedException e) { + // ignore and don't write the region historian + LOG.info("doRegionAssignment: Couldn't queue the region historian due to exception: " + e); + } + returnMsgs.add(new HMsg(HMsg.Type.MSG_REGION_OPEN, rs.getRegionInfo())); } @@ -357,18 +399,40 @@ * only caller (assignRegions, whose caller is ServerManager.processMsgs) owns * the monitor for RegionManager */ - private Set regionsAwaitingAssignment() { + private Set regionsAwaitingAssignment(HServerAddress addr, + boolean isSingleServer) { // set of regions we want to assign to this server Set regionsToAssign = new HashSet(); - - // Look over the set of regions that aren't currently assigned to + + boolean isMetaServer = isMetaServer(addr); + + // Handle if root is unassigned... only assign root if root is offline. + RegionState rootState = regionsInTransition.get(HRegionInfo.ROOT_REGIONINFO.getRegionName()); + if (rootState != null && rootState.isUnassigned()) { + // make sure root isnt assigned here first. + // if so return 'empty list' + // by definition there is no way this could be a ROOT region (since it's + // unassigned) so just make sure it isn't hosting META regions. + if (!isMetaServer) { + regionsToAssign.add(rootState); + } + return regionsToAssign; + } + + // Look over the set of regions that aren't currently assigned to // determine which we should assign to this server. + boolean reassigningMetas = numberOfMetaRegions.get() != onlineMetaRegions.size(); + boolean isMetaOrRoot = isMetaServer || isRootServer(addr); + if (reassigningMetas && isMetaOrRoot && !isSingleServer) { + return regionsToAssign; // dont assign anything to this server. + } + for (RegionState s: regionsInTransition.values()) { HRegionInfo i = s.getRegionInfo(); if (i == null) { continue; } - if (numberOfMetaRegions.get() != onlineMetaRegions.size() && + if (reassigningMetas && !i.isMetaRegion()) { // Can't assign user regions until all meta regions have been assigned // and are on-line @@ -457,7 +521,7 @@ } LOG.info("Skipped " + skipped + " region(s) that are in transition states"); } - + static class TableDirFilter implements PathFilter { public boolean accept(Path path) { @@ -607,7 +671,7 @@ Bytes.toString(HConstants.ROOT_TABLE_NAME)); } metaRegions.add(new MetaRegion(rootRegionLocation.get(), - HRegionInfo.ROOT_REGIONINFO.getRegionName())); + HRegionInfo.ROOT_REGIONINFO)); } else { if (!areAllMetaRegionsOnline()) { throw new NotAllMetaRegionsOnlineException(); @@ -685,7 +749,7 @@ * @return list of MetaRegion objects */ public List getListOfOnlineMetaRegions() { - List regions = null; + List regions; synchronized(onlineMetaRegions) { regions = new ArrayList(onlineMetaRegions.values()); } @@ -712,11 +776,104 @@ /** * Set an online MetaRegion offline - remove it from the map. * @param startKey region name + * @return the MetaRegion that was taken offline. */ - public void offlineMetaRegion(byte [] startKey) { - onlineMetaRegions.remove(startKey); + public MetaRegion offlineMetaRegion(byte [] startKey) { + LOG.info("META region removed from onlineMetaRegions"); + return onlineMetaRegions.remove(startKey); } - + + public boolean isRootServer(HServerAddress server) { + if (master.getRootRegionLocation() != null + && server.equals(master.getRootRegionLocation())) + return true; + return false; + } + + /** + * Returns the list of byte[] start-keys for any .META. regions hosted + * on the indicated server. + * + * @param server server address + * @return list of meta region start-keys. + */ + public List listMetaRegionsForServer(HServerAddress server) { + List metas = new ArrayList(); + + for ( MetaRegion region : onlineMetaRegions.values() ) { + if (server.equals(region.getServer())) { + metas.add(region.getStartKey()); + } + } + + return metas; + } + + /** + * Does this server have any META regions open on it, or any meta + * regions being assigned to it? + * + * @param server Server IP:port + * @return true if server has meta region assigned + */ + public boolean isMetaServer(HServerAddress server) { + for ( MetaRegion region : onlineMetaRegions.values() ) { + if (server.equals(region.getServer())) { + return true; + } + } + + // This might be expensive, but we need to make sure we dont + // get double assignment to the same regionserver. + for (RegionState s : regionsInTransition.values()) { + if (s.getRegionInfo().isMetaRegion() + && !s.isUnassigned() + && s.getServerName() != null + && s.getServerName().equals(server.toString())) { + // Has an outstanding meta region to be assigned. + return true; + } + } + return false; + } + + /** + * Call to take this metaserver offline for immediate reassignment. Used only + * when we know a region has shut down cleanly. + * + * A meta server is a server that hosts either -ROOT- or any .META. regions. + * + * If you are considering a unclean shutdown potentially, use ProcessServerShutdown which + * calls other methods to immediately unassign root/meta but delay the reassign until the + * log has been split. + * + * @param server the server that went down + * @return true if this was in fact a meta server, false if it did not carry meta regions. + */ + public synchronized boolean offlineMetaServer(HServerAddress server) { + boolean hasMeta = false; + + // check to see if ROOT and/or .META. are on this server, reassign them. + // use master.getRootRegionLocation. + if (master.getRootRegionLocation() != null && + server.equals(master.getRootRegionLocation())) { + LOG.info("Offlined ROOT server: " + server); + reassignRootRegion(); + hasMeta = true; + } + // AND + for ( MetaRegion region : onlineMetaRegions.values() ) { + if (server.equals(region.getServer())) { + LOG.info("Offlining META region: " + region); + offlineMetaRegion(region.getStartKey()); + // Set for reassignment. + setUnassigned(region.getRegionInfo(), true); + hasMeta = true; + } + } + return hasMeta; + } + /** * Remove a region from the region state map. * @@ -1282,4 +1439,4 @@ return Bytes.compareTo(getRegionName(), o.getRegionName()); } } -} \ No newline at end of file +} Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionServerOperation.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionServerOperation.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionServerOperation.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RegionServerOperation.java Sun May 31 16:28:03 2009 @@ -89,6 +89,14 @@ } return available; } - + + public int compareTo(RegionServerOperation other) { + return getPriority() - other.getPriority(); + } + + // the Priority of this operation, 0 is lowest priority + protected int getPriority() { + return Integer.MAX_VALUE; + } protected abstract boolean process() throws IOException; } \ No newline at end of file Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java Sun May 31 16:28:03 2009 @@ -90,6 +90,7 @@ exceptions.add(e); } } catch (Exception e) { + LOG.debug("Exception in RetryableMetaOperation: ", e); throw new RuntimeException(e); } sleeper.sleep(); Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RootScanner.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RootScanner.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RootScanner.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/RootScanner.java Sun May 31 16:28:03 2009 @@ -53,7 +53,7 @@ synchronized(scannerLock) { if (master.getRootRegionLocation() != null) { scanRegion(new MetaRegion(master.getRootRegionLocation(), - HRegionInfo.ROOT_REGIONINFO.getRegionName())); + HRegionInfo.ROOT_REGIONINFO)); } } } catch (IOException e) { Modified: hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ServerManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=780467&r1=780466&r2=780467&view=diff ============================================================================== --- hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original) +++ hadoop/hbase/branches/0.19/src/java/org/apache/hadoop/hbase/master/ServerManager.java Sun May 31 16:28:03 2009 @@ -186,14 +186,9 @@ // The startup message was from a known server with the same name. // Timeout the old one right away. HServerAddress root = master.getRootRegionLocation(); - boolean rootServer = false; - if (root != null && root.equals(storedInfo.getServerAddress())) { - master.regionManager.unsetRootRegion(); - rootServer = true; - } try { master.toDoQueue.put( - new ProcessServerShutdown(master, storedInfo, rootServer)); + new ProcessServerShutdown(master, storedInfo)); } catch (InterruptedException e) { LOG.error("Insertion into toDoQueue was interrupted", e); } @@ -320,13 +315,9 @@ for (int i = 1; i < msgs.length; i++) { LOG.info("Processing " + msgs[i] + " from " + serverName); HRegionInfo info = msgs[i].getRegionInfo(); - synchronized (master.regionManager) { - if (info.isRootRegion()) { - master.regionManager.reassignRootRegion(); - } else { - if (info.isMetaTable()) { - master.regionManager.offlineMetaRegion(info.getStartKey()); - } + // Meta/root region offlining is handed in removeServerInfo above. + if (!info.isMetaRegion()) { + synchronized (master.regionManager) { if (!master.regionManager.isOfflined(info.getRegionName())) { master.regionManager.setUnassigned(info, true); } else { @@ -628,10 +619,7 @@ // This method can be called a couple of times during shutdown. if (info != null) { LOG.info("Cancelling lease for " + serverName); - if (master.getRootRegionLocation() != null && - info.getServerAddress().equals(master.getRootRegionLocation())) { - master.regionManager.unsetRootRegion(); - } + master.regionManager.offlineMetaServer(info.getServerAddress()); try { serverLeases.cancelLease(serverName); } catch (LeaseException e) { @@ -774,16 +762,7 @@ LOG.info(server + " lease expired"); // Remove the server from the known servers list and update load info HServerInfo info = serversToServerInfo.remove(server); - boolean rootServer = false; if (info != null) { - HServerAddress root = master.getRootRegionLocation(); - if (root != null && root.equals(info.getServerAddress())) { - // NOTE: If the server was serving the root region, we cannot reassign - // it here because the new server will start serving the root region - // before ProcessServerShutdown has a chance to split the log file. - master.regionManager.unsetRootRegion(); - rootServer = true; - } String serverName = info.getServerAddress().toString(); HServerLoad load = serversToLoad.remove(serverName); if (load != null) { @@ -797,8 +776,7 @@ } deadServers.put(server, Boolean.FALSE); try { - master.toDoQueue.put( - new ProcessServerShutdown(master, info, rootServer)); + master.toDoQueue.put(new ProcessServerShutdown(master, info)); } catch (InterruptedException e) { LOG.error("insert into toDoQueue was interrupted", e); }